/* ----------------------------------------------------------------- */ /* The HMM-Based Speech Synthesis Engine "hts_engine API" */ /* developed by HTS Working Group */ /* http://hts-engine.sourceforge.net/ */ /* ----------------------------------------------------------------- */ /* */ /* Copyright (c) 2001-2015 Nagoya Institute of Technology */ /* Department of Computer Science */ /* */ /* 2001-2008 Tokyo Institute of Technology */ /* Interdisciplinary Graduate School of */ /* Science and Engineering */ /* */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* - Redistributions of source code must retain the above copyright */ /* notice, this list of conditions and the following disclaimer. */ /* - Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials provided */ /* with the distribution. */ /* - Neither the name of the HTS working group nor the names of its */ /* contributors may be used to endorse or promote products derived */ /* from this software without specific prior written permission. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */ /* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */ /* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */ /* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */ /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */ /* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */ /* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */ /* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */ /* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* ----------------------------------------------------------------- */ #ifndef HTS_ENGINE_C #define HTS_ENGINE_C #ifdef __cplusplus #define HTS_ENGINE_C_START extern "C" { #define HTS_ENGINE_C_END } #else #define HTS_ENGINE_C_START #define HTS_ENGINE_C_END #endif /* __CPLUSPLUS */ HTS_ENGINE_C_START; #include /* for atof() */ #include /* for strcpy() */ #include /* for pow() */ /* hts_engine libraries */ #include "HTS_hidden.h" /* HTS_Engine_initialize: initialize engine */ void HTS_Engine_initialize(HTS_Engine * engine) { /* global */ engine->condition.sampling_frequency = 0; engine->condition.fperiod = 0; engine->condition.audio_buff_size = 0; engine->condition.stop = FALSE; engine->condition.volume = 1.0; engine->condition.msd_threshold = NULL; engine->condition.gv_weight = NULL; /* duration */ engine->condition.speed = 1.0; engine->condition.phoneme_alignment_flag = FALSE; /* spectrum */ engine->condition.stage = 0; engine->condition.use_log_gain = FALSE; engine->condition.alpha = 0.0; engine->condition.beta = 0.0; /* log F0 */ engine->condition.additional_half_tone = 0.0; /* interpolation weights */ engine->condition.duration_iw = NULL; engine->condition.parameter_iw = NULL; engine->condition.gv_iw = NULL; /* initialize audio */ HTS_Audio_initialize(&engine->audio); /* initialize model set */ HTS_ModelSet_initialize(&engine->ms); /* initialize label list */ HTS_Label_initialize(&engine->label); /* initialize state sequence set */ HTS_SStreamSet_initialize(&engine->sss); /* initialize pstream set */ HTS_PStreamSet_initialize(&engine->pss); /* initialize gstream set */ HTS_GStreamSet_initialize(&engine->gss); } /* HTS_Engine_load: load HTS voices */ HTS_Boolean HTS_Engine_load(HTS_Engine * engine, char **voices, size_t num_voices) { size_t i, j; size_t nstream; double average_weight; const char *option, *find; /* reset engine */ HTS_Engine_clear(engine); /* load voices */ if (HTS_ModelSet_load(&engine->ms, voices, num_voices) != TRUE) { HTS_Engine_clear(engine); return FALSE; } nstream = HTS_ModelSet_get_nstream(&engine->ms); average_weight = 1.0 / num_voices; /* global */ engine->condition.sampling_frequency = HTS_ModelSet_get_sampling_frequency(&engine->ms); engine->condition.fperiod = HTS_ModelSet_get_fperiod(&engine->ms); engine->condition.msd_threshold = (double *) HTS_calloc(nstream, sizeof(double)); for (i = 0; i < nstream; i++) engine->condition.msd_threshold[i] = 0.5; engine->condition.gv_weight = (double *) HTS_calloc(nstream, sizeof(double)); for (i = 0; i < nstream; i++) engine->condition.gv_weight[i] = 1.0; /* spectrum */ option = HTS_ModelSet_get_option(&engine->ms, 0); find = strstr(option, "GAMMA="); if (find != NULL) engine->condition.stage = (size_t) atoi(&find[strlen("GAMMA=")]); find = strstr(option, "LN_GAIN="); if (find != NULL) engine->condition.use_log_gain = atoi(&find[strlen("LN_GAIN=")]) == 1 ? TRUE : FALSE; find = strstr(option, "ALPHA="); if (find != NULL) engine->condition.alpha = atof(&find[strlen("ALPHA=")]); /* interpolation weights */ engine->condition.duration_iw = (double *) HTS_calloc(num_voices, sizeof(double)); for (i = 0; i < num_voices; i++) engine->condition.duration_iw[i] = average_weight; engine->condition.parameter_iw = (double **) HTS_calloc(num_voices, sizeof(double *)); for (i = 0; i < num_voices; i++) { engine->condition.parameter_iw[i] = (double *) HTS_calloc(nstream, sizeof(double)); for (j = 0; j < nstream; j++) engine->condition.parameter_iw[i][j] = average_weight; } engine->condition.gv_iw = (double **) HTS_calloc(num_voices, sizeof(double *)); for (i = 0; i < num_voices; i++) { engine->condition.gv_iw[i] = (double *) HTS_calloc(nstream, sizeof(double)); for (j = 0; j < nstream; j++) engine->condition.gv_iw[i][j] = average_weight; } return TRUE; } /* HTS_Engine_set_sampling_frequency: set sampling frequency */ void HTS_Engine_set_sampling_frequency(HTS_Engine * engine, size_t i) { if (i < 1) i = 1; engine->condition.sampling_frequency = i; HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size); } /* HTS_Engine_get_sampling_frequency: get sampling frequency */ size_t HTS_Engine_get_sampling_frequency(HTS_Engine * engine) { return engine->condition.sampling_frequency; } /* HTS_Engine_set_fperiod: set frame period */ void HTS_Engine_set_fperiod(HTS_Engine * engine, size_t i) { if (i < 1) i = 1; engine->condition.fperiod = i; } /* HTS_Engine_get_fperiod: get frame period */ size_t HTS_Engine_get_fperiod(HTS_Engine * engine) { return engine->condition.fperiod; } /* HTS_Engine_set_audio_buff_size: set audio buffer size */ void HTS_Engine_set_audio_buff_size(HTS_Engine * engine, size_t i) { engine->condition.audio_buff_size = i; HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size); } /* HTS_Engine_get_audio_buff_size: get audio buffer size */ size_t HTS_Engine_get_audio_buff_size(HTS_Engine * engine) { return engine->condition.audio_buff_size; } /* HTS_Engine_set_stop_flag: set stop flag */ void HTS_Engine_set_stop_flag(HTS_Engine * engine, HTS_Boolean b) { engine->condition.stop = b; } /* HTS_Engine_get_stop_flag: get stop flag */ HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine * engine) { return engine->condition.stop; } /* HTS_Engine_set_volume: set volume in db */ void HTS_Engine_set_volume(HTS_Engine * engine, double f) { engine->condition.volume = exp(f * DB); } /* HTS_Engine_get_volume: get volume in db */ double HTS_Engine_get_volume(HTS_Engine * engine) { return log(engine->condition.volume) / DB; } /* HTS_Egnine_set_msd_threshold: set MSD threshold */ void HTS_Engine_set_msd_threshold(HTS_Engine * engine, size_t stream_index, double f) { if (f < 0.0) f = 0.0; if (f > 1.0) f = 1.0; engine->condition.msd_threshold[stream_index] = f; } /* HTS_Engine_get_msd_threshold: get MSD threshold */ double HTS_Engine_get_msd_threshold(HTS_Engine * engine, size_t stream_index) { return engine->condition.msd_threshold[stream_index]; } /* HTS_Engine_set_gv_weight: set GV weight */ void HTS_Engine_set_gv_weight(HTS_Engine * engine, size_t stream_index, double f) { if (f < 0.0) f = 0.0; engine->condition.gv_weight[stream_index] = f; } /* HTS_Engine_get_gv_weight: get GV weight */ double HTS_Engine_get_gv_weight(HTS_Engine * engine, size_t stream_index) { return engine->condition.gv_weight[stream_index]; } /* HTS_Engine_set_speed: set speech speed */ void HTS_Engine_set_speed(HTS_Engine * engine, double f) { if (f < 1.0E-06) f = 1.0E-06; engine->condition.speed = f; } /* HTS_Engine_set_phoneme_alignment_flag: set flag for using phoneme alignment in label */ void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine, HTS_Boolean b) { engine->condition.phoneme_alignment_flag = b; } /* HTS_Engine_set_alpha: set alpha */ void HTS_Engine_set_alpha(HTS_Engine * engine, double f) { if (f < 0.0) f = 0.0; if (f > 1.0) f = 1.0; engine->condition.alpha = f; } /* HTS_Engine_get_alpha: get alpha */ double HTS_Engine_get_alpha(HTS_Engine * engine) { return engine->condition.alpha; } /* HTS_Engine_set_beta: set beta */ void HTS_Engine_set_beta(HTS_Engine * engine, double f) { if (f < 0.0) f = 0.0; if (f > 1.0) f = 1.0; engine->condition.beta = f; } /* HTS_Engine_get_beta: get beta */ double HTS_Engine_get_beta(HTS_Engine * engine) { return engine->condition.beta; } /* HTS_Engine_add_half_tone: add half tone */ void HTS_Engine_add_half_tone(HTS_Engine * engine, double f) { engine->condition.additional_half_tone = f; } /* HTS_Engine_set_duration_interpolation_weight: set interpolation weight for duration */ void HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index, double f) { engine->condition.duration_iw[voice_index] = f; } /* HTS_Engine_get_duration_interpolation_weight: get interpolation weight for duration */ double HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index) { return engine->condition.duration_iw[voice_index]; } /* HTS_Engine_set_parameter_interpolation_weight: set interpolation weight for parameter */ void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f) { engine->condition.parameter_iw[voice_index][stream_index] = f; } /* HTS_Engine_get_parameter_interpolation_weight: get interpolation weight for parameter */ double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index) { return engine->condition.parameter_iw[voice_index][stream_index]; } /* HTS_Engine_set_gv_interpolation_weight: set interpolation weight for GV */ void HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f) { engine->condition.gv_iw[voice_index][stream_index] = f; } /* HTS_Engine_get_gv_interpolation_weight: get interpolation weight for GV */ double HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index) { return engine->condition.gv_iw[voice_index][stream_index]; } /* HTS_Engine_get_total_state: get total number of state */ size_t HTS_Engine_get_total_state(HTS_Engine * engine) { return HTS_SStreamSet_get_total_state(&engine->sss); } /* HTS_Engine_set_state_mean: set mean value of state */ void HTS_Engine_set_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index, double f) { HTS_SStreamSet_set_mean(&engine->sss, stream_index, state_index, vector_index, f); } /* HTS_Engine_get_state_mean: get mean value of state */ double HTS_Engine_get_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index) { return HTS_SStreamSet_get_mean(&engine->sss, stream_index, state_index, vector_index); } /* HTS_Engine_get_state_duration: get state duration */ size_t HTS_Engine_get_state_duration(HTS_Engine * engine, size_t state_index) { return HTS_SStreamSet_get_duration(&engine->sss, state_index); } /* HTS_Engine_get_nvoices: get number of voices */ size_t HTS_Engine_get_nvoices(HTS_Engine * engine) { return HTS_ModelSet_get_nvoices(&engine->ms); } /* HTS_Engine_get_nstream: get number of stream */ size_t HTS_Engine_get_nstream(HTS_Engine * engine) { return HTS_ModelSet_get_nstream(&engine->ms); } /* HTS_Engine_get_nstate: get number of state */ size_t HTS_Engine_get_nstate(HTS_Engine * engine) { return HTS_ModelSet_get_nstate(&engine->ms); } /* HTS_Engine_get_fullcontext_label_format: get full context label format */ const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine) { return HTS_ModelSet_get_fullcontext_label_format(&engine->ms); } /* HTS_Engine_get_fullcontext_label_version: get full context label version */ const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine) { return HTS_ModelSet_get_fullcontext_label_version(&engine->ms); } /* HTS_Engine_get_total_frame: get total number of frame */ size_t HTS_Engine_get_total_frame(HTS_Engine * engine) { return HTS_GStreamSet_get_total_frame(&engine->gss); } /* HTS_Engine_get_nsamples: get number of samples */ size_t HTS_Engine_get_nsamples(HTS_Engine * engine) { return HTS_GStreamSet_get_total_nsamples(&engine->gss); } /* HTS_Engine_get_generated_parameter: output generated parameter */ double HTS_Engine_get_generated_parameter(HTS_Engine * engine, size_t stream_index, size_t frame_index, size_t vector_index) { return HTS_GStreamSet_get_parameter(&engine->gss, stream_index, frame_index, vector_index); } /* HTS_Engine_get_generated_speech: output generated speech */ double HTS_Engine_get_generated_speech(HTS_Engine * engine, size_t index) { return HTS_GStreamSet_get_speech(&engine->gss, index); } /* HTS_Engine_generate_state_sequence: genereate state sequence (1st synthesis step) */ static HTS_Boolean HTS_Engine_generate_state_sequence(HTS_Engine * engine) { size_t i, state_index, model_index; double f; if (HTS_SStreamSet_create(&engine->sss, &engine->ms, &engine->label, engine->condition.phoneme_alignment_flag, engine->condition.speed, engine->condition.duration_iw, engine->condition.parameter_iw, engine->condition.gv_iw) != TRUE) { HTS_Engine_refresh(engine); return FALSE; } if (engine->condition.additional_half_tone != 0.0) { state_index = 0; model_index = 0; for (i = 0; i < HTS_Engine_get_total_state(engine); i++) { f = HTS_Engine_get_state_mean(engine, 1, i, 0); f += engine->condition.additional_half_tone * HALF_TONE; if (f < MIN_LF0) f = MIN_LF0; else if (f > MAX_LF0) f = MAX_LF0; HTS_Engine_set_state_mean(engine, 1, i, 0, f); state_index++; if (state_index >= HTS_Engine_get_nstate(engine)) { state_index = 0; model_index++; } } } return TRUE; } /* HTS_Engine_generate_state_sequence_from_fn: genereate state sequence from file name (1st synthesis step) */ HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine, const char *fn) { HTS_Engine_refresh(engine); HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn); return HTS_Engine_generate_state_sequence(engine); } /* HTS_Engine_generate_state_sequence_from_strings: generate state sequence from strings (1st synthesis step) */ HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine, char **lines, size_t num_lines) { HTS_Engine_refresh(engine); HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines); return HTS_Engine_generate_state_sequence(engine); } /* HTS_Engine_generate_parameter_sequence: generate parameter sequence (2nd synthesis step) */ HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine * engine) { return HTS_PStreamSet_create(&engine->pss, &engine->sss, engine->condition.msd_threshold, engine->condition.gv_weight); } /* HTS_Engine_generate_sample_sequence: generate sample sequence (3rd synthesis step) */ HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine * engine) { return HTS_GStreamSet_create(&engine->gss, &engine->pss, engine->condition.stage, engine->condition.use_log_gain, engine->condition.sampling_frequency, engine->condition.fperiod, engine->condition.alpha, engine->condition.beta, &engine->condition.stop, engine->condition.volume, engine->condition.audio_buff_size > 0 ? &engine->audio : NULL); } /* HTS_Engine_synthesize: synthesize speech */ static HTS_Boolean HTS_Engine_synthesize(HTS_Engine * engine) { if (HTS_Engine_generate_state_sequence(engine) != TRUE) { HTS_Engine_refresh(engine); return FALSE; } if (HTS_Engine_generate_parameter_sequence(engine) != TRUE) { HTS_Engine_refresh(engine); return FALSE; } if (HTS_Engine_generate_sample_sequence(engine) != TRUE) { HTS_Engine_refresh(engine); return FALSE; } return TRUE; } /* HTS_Engine_synthesize_from_fn: synthesize speech from file name */ HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine * engine, const char *fn) { HTS_Engine_refresh(engine); HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn); return HTS_Engine_synthesize(engine); } /* HTS_Engine_synthesize_from_strings: synthesize speech from strings */ HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines) { HTS_Engine_refresh(engine); HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines); return HTS_Engine_synthesize(engine); } /* HTS_Engine_save_information: save trace information */ void HTS_Engine_save_information(HTS_Engine * engine, FILE * fp) { size_t i, j, k, l, m, n; double temp; HTS_Condition *condition = &engine->condition; HTS_ModelSet *ms = &engine->ms; HTS_Label *label = &engine->label; HTS_SStreamSet *sss = &engine->sss; HTS_PStreamSet *pss = &engine->pss; /* global parameter */ fprintf(fp, "[Global parameter]\n"); fprintf(fp, "Sampring frequency -> %8lu(Hz)\n", (unsigned long) condition->sampling_frequency); fprintf(fp, "Frame period -> %8lu(point)\n", (unsigned long) condition->fperiod); fprintf(fp, " %8.5f(msec)\n", 1e+3 * condition->fperiod / condition->sampling_frequency); fprintf(fp, "All-pass constant -> %8.5f\n", (float) condition->alpha); fprintf(fp, "Gamma -> %8.5f\n", (float) (condition->stage == 0 ? 0.0 : -1.0 / condition->stage)); if (condition->stage != 0) { if (condition->use_log_gain == TRUE) fprintf(fp, "Log gain flag -> TRUE\n"); else fprintf(fp, "Log gain flag -> FALSE\n"); } fprintf(fp, "Postfiltering coefficient -> %8.5f\n", (float) condition->beta); fprintf(fp, "Audio buffer size -> %8lu(sample)\n", (unsigned long) condition->audio_buff_size); fprintf(fp, "\n"); /* duration parameter */ fprintf(fp, "[Duration parameter]\n"); fprintf(fp, "Number of states -> %8lu\n", (unsigned long) HTS_ModelSet_get_nstate(ms)); fprintf(fp, " Interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms)); /* check interpolation */ for (i = 0, temp = 0.0; i < HTS_ModelSet_get_nvoices(ms); i++) temp += condition->duration_iw[i]; for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++) if (condition->duration_iw[i] != 0.0) condition->duration_iw[i] /= temp; for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++) fprintf(fp, " Interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) i, (float) (100 * condition->duration_iw[i])); fprintf(fp, "\n"); fprintf(fp, "[Stream parameter]\n"); for (i = 0; i < HTS_ModelSet_get_nstream(ms); i++) { /* stream parameter */ fprintf(fp, "Stream[%2lu] vector length -> %8lu\n", (unsigned long) i, (unsigned long) HTS_ModelSet_get_vector_length(ms, i)); fprintf(fp, " Dynamic window size -> %8lu\n", (unsigned long) HTS_ModelSet_get_window_size(ms, i)); /* interpolation */ fprintf(fp, " Interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms)); for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++) temp += condition->parameter_iw[j][i]; for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) if (condition->parameter_iw[j][i] != 0.0) condition->parameter_iw[j][i] /= temp; for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) fprintf(fp, " Interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->parameter_iw[j][i])); /* MSD */ if (HTS_ModelSet_is_msd(ms, i)) { /* for MSD */ fprintf(fp, " MSD flag -> TRUE\n"); fprintf(fp, " MSD threshold -> %8.5f\n", condition->msd_threshold[i]); } else { /* for non MSD */ fprintf(fp, " MSD flag -> FALSE\n"); } /* GV */ if (HTS_ModelSet_use_gv(ms, i)) { fprintf(fp, " GV flag -> TRUE\n"); fprintf(fp, " GV weight -> %8.0f(%%)\n", (float) (100 * condition->gv_weight[i])); fprintf(fp, " GV interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms)); /* interpolation */ for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++) temp += condition->gv_iw[j][i]; for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) if (condition->gv_iw[j][i] != 0.0) condition->gv_iw[j][i] /= temp; for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) fprintf(fp, " GV interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->gv_iw[j][i])); } else { fprintf(fp, " GV flag -> FALSE\n"); } } fprintf(fp, "\n"); /* generated sequence */ fprintf(fp, "[Generated sequence]\n"); fprintf(fp, "Number of HMMs -> %8lu\n", (unsigned long) HTS_Label_get_size(label)); fprintf(fp, "Number of stats -> %8lu\n", (unsigned long) HTS_Label_get_size(label) * HTS_ModelSet_get_nstate(ms)); fprintf(fp, "Length of this speech -> %8.3f(sec)\n", (float) ((double) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod / condition->sampling_frequency)); fprintf(fp, " -> %8lu(frames)\n", (unsigned long) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod); for (i = 0; i < HTS_Label_get_size(label); i++) { fprintf(fp, "HMM[%2lu]\n", (unsigned long) i); fprintf(fp, " Name -> %s\n", HTS_Label_get_string(label, i)); fprintf(fp, " Duration\n"); for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) { fprintf(fp, " Interpolation[%2lu]\n", (unsigned long) j); HTS_ModelSet_get_duration_index(ms, j, HTS_Label_get_string(label, i), &k, &l); fprintf(fp, " Tree index -> %8lu\n", (unsigned long) k); fprintf(fp, " PDF index -> %8lu\n", (unsigned long) l); } for (j = 0; j < HTS_ModelSet_get_nstate(ms); j++) { fprintf(fp, " State[%2lu]\n", (unsigned long) j + 2); fprintf(fp, " Length -> %8lu(frames)\n", (unsigned long) HTS_SStreamSet_get_duration(sss, i * HTS_ModelSet_get_nstate(ms) + j)); for (k = 0; k < HTS_ModelSet_get_nstream(ms); k++) { fprintf(fp, " Stream[%2lu]\n", (unsigned long) k); if (HTS_ModelSet_is_msd(ms, k)) { if (HTS_SStreamSet_get_msd(sss, k, i * HTS_ModelSet_get_nstate(ms) + j) > condition->msd_threshold[k]) fprintf(fp, " MSD flag -> TRUE\n"); else fprintf(fp, " MSD flag -> FALSE\n"); } for (l = 0; l < HTS_ModelSet_get_nvoices(ms); l++) { fprintf(fp, " Interpolation[%2lu]\n", (unsigned long) l); HTS_ModelSet_get_parameter_index(ms, l, k, j + 2, HTS_Label_get_string(label, i), &m, &n); fprintf(fp, " Tree index -> %8lu\n", (unsigned long) m); fprintf(fp, " PDF index -> %8lu\n", (unsigned long) n); } } } } } /* HTS_Engine_save_label: save label with time */ void HTS_Engine_save_label(HTS_Engine * engine, FILE * fp) { size_t i, j; size_t frame, state, duration; HTS_Label *label = &engine->label; HTS_SStreamSet *sss = &engine->sss; size_t nstate = HTS_ModelSet_get_nstate(&engine->ms); double rate = engine->condition.fperiod * 1.0e+07 / engine->condition.sampling_frequency; for (i = 0, state = 0, frame = 0; i < HTS_Label_get_size(label); i++) { for (j = 0, duration = 0; j < nstate; j++) duration += HTS_SStreamSet_get_duration(sss, state++); fprintf(fp, "%lu %lu %s\n", (unsigned long) (frame * rate), (unsigned long) ((frame + duration) * rate), HTS_Label_get_string(label, i)); frame += duration; } } /* HTS_Engine_save_generated_parameter: save generated parameter */ void HTS_Engine_save_generated_parameter(HTS_Engine * engine, size_t stream_index, FILE * fp) { size_t i, j; float temp; HTS_GStreamSet *gss = &engine->gss; for (i = 0; i < HTS_GStreamSet_get_total_frame(gss); i++) for (j = 0; j < HTS_GStreamSet_get_vector_length(gss, stream_index); j++) { temp = (float) HTS_GStreamSet_get_parameter(gss, stream_index, i, j); fwrite(&temp, sizeof(float), 1, fp); } } /* HTS_Engine_save_generated_speech: save generated speech */ void HTS_Engine_save_generated_speech(HTS_Engine * engine, FILE * fp) { size_t i; double x; short temp; HTS_GStreamSet *gss = &engine->gss; for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) { x = HTS_GStreamSet_get_speech(gss, i); if (x > 32767.0) temp = 32767; else if (x < -32768.0) temp = -32768; else temp = (short) x; fwrite(&temp, sizeof(short), 1, fp); } } /* HTS_Engine_save_riff: save RIFF format file */ void HTS_Engine_save_riff(HTS_Engine * engine, FILE * fp) { size_t i; double x; short temp; HTS_GStreamSet *gss = &engine->gss; char data_01_04[] = { 'R', 'I', 'F', 'F' }; int data_05_08 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short) + 36; char data_09_12[] = { 'W', 'A', 'V', 'E' }; char data_13_16[] = { 'f', 'm', 't', ' ' }; int data_17_20 = 16; short data_21_22 = 1; /* PCM */ short data_23_24 = 1; /* monoral */ int data_25_28 = engine->condition.sampling_frequency; int data_29_32 = engine->condition.sampling_frequency * sizeof(short); short data_33_34 = sizeof(short); short data_35_36 = (short) (sizeof(short) * 8); char data_37_40[] = { 'd', 'a', 't', 'a' }; int data_41_44 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short); /* write header */ HTS_fwrite_little_endian(data_01_04, sizeof(char), 4, fp); HTS_fwrite_little_endian(&data_05_08, sizeof(int), 1, fp); HTS_fwrite_little_endian(data_09_12, sizeof(char), 4, fp); HTS_fwrite_little_endian(data_13_16, sizeof(char), 4, fp); HTS_fwrite_little_endian(&data_17_20, sizeof(int), 1, fp); HTS_fwrite_little_endian(&data_21_22, sizeof(short), 1, fp); HTS_fwrite_little_endian(&data_23_24, sizeof(short), 1, fp); HTS_fwrite_little_endian(&data_25_28, sizeof(int), 1, fp); HTS_fwrite_little_endian(&data_29_32, sizeof(int), 1, fp); HTS_fwrite_little_endian(&data_33_34, sizeof(short), 1, fp); HTS_fwrite_little_endian(&data_35_36, sizeof(short), 1, fp); HTS_fwrite_little_endian(data_37_40, sizeof(char), 4, fp); HTS_fwrite_little_endian(&data_41_44, sizeof(int), 1, fp); /* write data */ for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) { x = HTS_GStreamSet_get_speech(gss, i); if (x > 32767.0) temp = 32767; else if (x < -32768.0) temp = -32768; else temp = (short) x; HTS_fwrite_little_endian(&temp, sizeof(short), 1, fp); } } /* HTS_Engine_refresh: free model per one time synthesis */ void HTS_Engine_refresh(HTS_Engine * engine) { /* free generated parameter stream set */ HTS_GStreamSet_clear(&engine->gss); /* free parameter stream set */ HTS_PStreamSet_clear(&engine->pss); /* free state stream set */ HTS_SStreamSet_clear(&engine->sss); /* free label list */ HTS_Label_clear(&engine->label); /* stop flag */ engine->condition.stop = FALSE; } /* HTS_Engine_clear: free engine */ void HTS_Engine_clear(HTS_Engine * engine) { size_t i; if (engine->condition.msd_threshold != NULL) HTS_free(engine->condition.msd_threshold); if (engine->condition.duration_iw != NULL) HTS_free(engine->condition.duration_iw); if (engine->condition.gv_weight != NULL) HTS_free(engine->condition.gv_weight); if (engine->condition.parameter_iw != NULL) { for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++) HTS_free(engine->condition.parameter_iw[i]); HTS_free(engine->condition.parameter_iw); } if (engine->condition.gv_iw != NULL) { for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++) HTS_free(engine->condition.gv_iw[i]); HTS_free(engine->condition.gv_iw); } HTS_ModelSet_clear(&engine->ms); HTS_Audio_clear(&engine->audio); HTS_Engine_initialize(engine); } HTS_ENGINE_C_END; #endif /* !HTS_ENGINE_C */