flightgear/3rdparty/hts_engine_API/bin/hts_engine.c
2022-10-20 20:29:11 +08:00

345 lines
13 KiB
C

/* ----------------------------------------------------------------- */
/* The HMM-Based Speech Synthesis Engine "hts_engine API" */
/* developed by HTS Working Group */
/* http://hts-engine.sourceforge.net/ */
/* ----------------------------------------------------------------- */
/* */
/* Copyright (c) 2001-2015 Nagoya Institute of Technology */
/* Department of Computer Science */
/* */
/* 2001-2008 Tokyo Institute of Technology */
/* Interdisciplinary Graduate School of */
/* Science and Engineering */
/* */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* - Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* - Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials provided */
/* with the distribution. */
/* - Neither the name of the HTS working group nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* ----------------------------------------------------------------- */
#ifndef HTS_ENGINE_C
#define HTS_ENGINE_C
#ifdef __cplusplus
#define HTS_ENGINE_C_START extern "C" {
#define HTS_ENGINE_C_END }
#else
#define HTS_ENGINE_C_START
#define HTS_ENGINE_C_END
#endif /* __CPLUSPLUS */
HTS_ENGINE_C_START;
#include <stdlib.h>
#include "HTS_engine.h"
/* usage: print the copyright banner and the command-line help to stderr, then exit(0) */
void usage(void)
{
   /* help text, emitted verbatim and in order after the copyright banner */
   static const char *const help_text[] = {
      "hts_engine - The HMM-based speech synthesis engine \"hts_engine API\"\n",
      "\n",
      " usage:\n",
      " hts_engine [ options ] [ infile ]\n",
      " options: [ def][ min-- max]\n",
      " -m htsvoice : HTS voice files [ N/A]\n",
      " -od s : filename of output label with duration [ N/A]\n",
      " -om s : filename of output spectrum [ N/A]\n",
      " -of s : filename of output log F0 [ N/A]\n",
      " -ol s : filename of output low-pass filter [ N/A]\n",
      " -or s : filename of output raw audio (generated speech) [ N/A]\n",
      " -ow s : filename of output wav audio (generated speech) [ N/A]\n",
      " -ot s : filename of output trace information [ N/A]\n",
      " -vp : use phoneme alignment for duration [ N/A]\n",
      " -i i f1 .. fi : enable interpolation & specify number(i),coefficient(f) [ N/A]\n",
      " -s i : sampling frequency [ auto][ 1-- ]\n",
      " -p i : frame period (point) [ auto][ 1-- ]\n",
      " -a f : all-pass constant [ auto][ 0.0-- 1.0]\n",
      " -b f : postfiltering coefficient [ 0.0][ 0.0-- 1.0]\n",
      " -r f : speech speed rate [ 1.0][ 0.0-- ]\n",
      " -fm f : additional half-tone [ 0.0][ -- ]\n",
      " -u f : voiced/unvoiced threshold [ 0.5][ 0.0-- 1.0]\n",
      " -jm f : weight of GV for spectrum [ 1.0][ 0.0-- ]\n",
      " -jf f : weight of GV for log F0 [ 1.0][ 0.0-- ]\n",
      " -g f : volume (dB) [ 0.0][ -- ]\n",
      " -z i : audio buffer size (if i==0, turn off) [ 0][ 0-- ]\n",
      " infile:\n",
      " label file\n",
      " note:\n",
      " generated spectrum, log F0, and low-pass filter coefficient\n",
      " sequences are saved in natural endian, binary (float) format.\n",
      "\n"
   };
   const size_t line_count = sizeof(help_text) / sizeof(help_text[0]);
   size_t line;

   fprintf(stderr, "%s\n", HTS_COPYRIGHT);
   for (line = 0; line < line_count; line++)
      fputs(help_text[line], stderr);

   /* this function never returns to its caller */
   exit(0);
}
/* get_option_value: advance to the value following the current option and return it. */
/* On entry *argv points at the option being parsed and *argc counts that option plus */
/* every argument after it, so a value is present only when *argc >= 2. If the value */
/* is missing, an error is printed, the engine is cleared and the program exits(1). */
static char *get_option_value(HTS_Engine * engine, int *argc, char ***argv)
{
   if (*argc < 2) {
      fprintf(stderr, "Error: Option '%s' requires an argument.\n", **argv);
      HTS_Engine_clear(engine);
      exit(1);
   }
   --(*argc);
   ++(*argv);
   return **argv;
}

/* open_output_file: fopen() wrapper that reports a failure on stderr. */
/* Returns NULL on failure; the caller then simply skips that output. */
static FILE *open_output_file(const char *fn, const char *mode)
{
   FILE *fp = fopen(fn, mode);

   if (fp == NULL)
      fprintf(stderr, "Warning: Cannot open output file '%s'.\n", fn);
   return fp;
}

/* main: command-line front end of hts_engine. */
/* 1. collects every "-m" voice file name and loads the voices, */
/* 2. parses the remaining options and applies them to the engine, */
/* 3. synthesizes from the label file and writes the requested outputs. */
/* Returns 0 on success; every error path prints a message and exits with status 1. */
int main(int argc, char **argv)
{
   int i;
   size_t w;
   double f;

   /* hts_engine API */
   HTS_Engine engine;

   /* HTS voices */
   size_t num_voices;
   char **fn_voices;

   /* input label file name */
   char *labfn = NULL;

   /* output file pointers */
   FILE *durfp = NULL, *mgcfp = NULL, *lf0fp = NULL, *lpffp = NULL, *wavfp = NULL, *rawfp = NULL, *tracefp = NULL;

   /* interpolation weights */
   size_t num_interpolation_weights;

   /* output usage */
   if (argc <= 1)
      usage();

   /* initialize hts_engine API */
   HTS_Engine_initialize(&engine);

   /* get HTS voice file names (argc slots is an upper bound on the number of voices) */
   num_voices = 0;
   fn_voices = (char **) malloc(argc * sizeof(char *));
   if (fn_voices == NULL) {
      fprintf(stderr, "Error: Cannot allocate memory.\n");
      HTS_Engine_clear(&engine);
      exit(1);
   }
   /* argv[0] is the program name, so the scan starts at 1 */
   for (i = 1; i < argc; i++) {
      if (argv[i][0] == '-' && argv[i][1] == 'm') {
         /* a trailing "-m" has no file name: argv[argc] is NULL and must not be used */
         if (i + 1 >= argc) {
            fprintf(stderr, "Error: Option '-m' requires an argument.\n");
            free(fn_voices);
            HTS_Engine_clear(&engine);
            exit(1);
         }
         fn_voices[num_voices++] = argv[++i];
      }
      if (argv[i][0] == '-' && argv[i][1] == 'h')
         usage();
   }
   if (num_voices == 0) {
      fprintf(stderr, "Error: HTS voice must be specified.\n");
      free(fn_voices);
      HTS_Engine_clear(&engine);
      exit(1);
   }

   /* load HTS voices */
   if (HTS_Engine_load(&engine, fn_voices, num_voices) != TRUE) {
      fprintf(stderr, "Error: HTS voices cannot be loaded.\n");
      free(fn_voices);
      HTS_Engine_clear(&engine);
      exit(1);
   }
   free(fn_voices);

   /* get options */
   /* inside the loop body argc counts the current argument plus all that follow it; */
   /* get_option_value() keeps that invariant, so argc never drops below 1 here */
   while (--argc) {
      if (**++argv == '-') {
         switch (*(*argv + 1)) {
         case 'v':
            switch (*(*argv + 2)) {
            case 'p':
               HTS_Engine_set_phoneme_alignment_flag(&engine, TRUE);
               break;
            default:
               fprintf(stderr, "Error: Invalid option '-v%c'.\n", *(*argv + 2));
               HTS_Engine_clear(&engine);
               exit(1);
            }
            break;
         case 'o':
            /* the sub-option letter is tested before argv is advanced to the file name */
            switch (*(*argv + 2)) {
            case 'w':
               wavfp = open_output_file(get_option_value(&engine, &argc, &argv), "wb");
               break;
            case 'r':
               rawfp = open_output_file(get_option_value(&engine, &argc, &argv), "wb");
               break;
            case 'd':
               durfp = open_output_file(get_option_value(&engine, &argc, &argv), "wt");
               break;
            case 'm':
               mgcfp = open_output_file(get_option_value(&engine, &argc, &argv), "wb");
               break;
            case 'f':
            case 'p':
               lf0fp = open_output_file(get_option_value(&engine, &argc, &argv), "wb");
               break;
            case 'l':
               lpffp = open_output_file(get_option_value(&engine, &argc, &argv), "wb");
               break;
            case 't':
               tracefp = open_output_file(get_option_value(&engine, &argc, &argv), "wt");
               break;
            default:
               fprintf(stderr, "Error: Invalid option '-o%c'.\n", *(*argv + 2));
               HTS_Engine_clear(&engine);
               exit(1);
            }
            break;
         case 'h':
            usage();
            break;
         case 'm':
            /* HTS voices were already loaded; just skip the file name */
            (void) get_option_value(&engine, &argc, &argv);
            break;
         case 's':
            HTS_Engine_set_sampling_frequency(&engine, (size_t) atoi(get_option_value(&engine, &argc, &argv)));
            break;
         case 'p':
            HTS_Engine_set_fperiod(&engine, (size_t) atoi(get_option_value(&engine, &argc, &argv)));
            break;
         case 'a':
            HTS_Engine_set_alpha(&engine, atof(get_option_value(&engine, &argc, &argv)));
            break;
         case 'b':
            HTS_Engine_set_beta(&engine, atof(get_option_value(&engine, &argc, &argv)));
            break;
         case 'r':
            HTS_Engine_set_speed(&engine, atof(get_option_value(&engine, &argc, &argv)));
            break;
         case 'f':
            switch (*(*argv + 2)) {
            case 'm':
               HTS_Engine_add_half_tone(&engine, atof(get_option_value(&engine, &argc, &argv)));
               break;
            default:
               fprintf(stderr, "Error: Invalid option '-f%c'.\n", *(*argv + 2));
               HTS_Engine_clear(&engine);
               exit(1);
            }
            break;
         case 'u':
            HTS_Engine_set_msd_threshold(&engine, 1, atof(get_option_value(&engine, &argc, &argv)));
            break;
         case 'i':
            num_interpolation_weights = atoi(get_option_value(&engine, &argc, &argv));
            /* one weight per loaded voice is required */
            if (num_interpolation_weights != num_voices) {
               fprintf(stderr, "Error: Number of interpolation weights must match the number of HTS voices.\n");
               HTS_Engine_clear(&engine);
               exit(1);
            }
            for (w = 0; w < num_interpolation_weights; w++) {
               f = atof(get_option_value(&engine, &argc, &argv));
               /* the same weight is applied to duration and to streams 0 and 1 (parameters and GV) */
               HTS_Engine_set_duration_interpolation_weight(&engine, w, f);
               HTS_Engine_set_parameter_interpolation_weight(&engine, w, 0, f);
               HTS_Engine_set_parameter_interpolation_weight(&engine, w, 1, f);
               HTS_Engine_set_gv_interpolation_weight(&engine, w, 0, f);
               HTS_Engine_set_gv_interpolation_weight(&engine, w, 1, f);
            }
            break;
         case 'j':
            switch (*(*argv + 2)) {
            case 'm':
               HTS_Engine_set_gv_weight(&engine, 0, atof(get_option_value(&engine, &argc, &argv)));
               break;
            case 'f':
            case 'p':
               HTS_Engine_set_gv_weight(&engine, 1, atof(get_option_value(&engine, &argc, &argv)));
               break;
            default:
               fprintf(stderr, "Error: Invalid option '-j%c'.\n", *(*argv + 2));
               HTS_Engine_clear(&engine);
               exit(1);
            }
            break;
         case 'g':
            HTS_Engine_set_volume(&engine, atof(get_option_value(&engine, &argc, &argv)));
            break;
         case 'z':
            HTS_Engine_set_audio_buff_size(&engine, (size_t) atoi(get_option_value(&engine, &argc, &argv)));
            break;
         default:
            fprintf(stderr, "Error: Invalid option '-%c'.\n", *(*argv + 1));
            HTS_Engine_clear(&engine);
            exit(1);
         }
      } else {
         /* any argument that is not an option is taken as the label file (last one wins) */
         labfn = *argv;
      }
   }

   /* a label file is required: do not hand a NULL file name to the engine */
   if (labfn == NULL) {
      fprintf(stderr, "Error: Label file must be specified.\n");
      HTS_Engine_clear(&engine);
      exit(1);
   }

   /* synthesize */
   if (HTS_Engine_synthesize_from_fn(&engine, labfn) != TRUE) {
      fprintf(stderr, "Error: waveform cannot be synthesized.\n");
      HTS_Engine_clear(&engine);
      exit(1);
   }

   /* output (only the files that were requested and could be opened) */
   if (tracefp != NULL)
      HTS_Engine_save_information(&engine, tracefp);
   if (durfp != NULL)
      HTS_Engine_save_label(&engine, durfp);
   if (rawfp != NULL)
      HTS_Engine_save_generated_speech(&engine, rawfp);
   if (wavfp != NULL)
      HTS_Engine_save_riff(&engine, wavfp);
   if (mgcfp != NULL)
      HTS_Engine_save_generated_parameter(&engine, 0, mgcfp);
   if (lf0fp != NULL)
      HTS_Engine_save_generated_parameter(&engine, 1, lf0fp);
   if (lpffp != NULL)
      HTS_Engine_save_generated_parameter(&engine, 2, lpffp);

   /* reset */
   HTS_Engine_refresh(&engine);

   /* free memory */
   HTS_Engine_clear(&engine);

   /* close files */
   if (durfp != NULL)
      fclose(durfp);
   if (mgcfp != NULL)
      fclose(mgcfp);
   if (lf0fp != NULL)
      fclose(lf0fp);
   if (lpffp != NULL)
      fclose(lpffp);
   if (wavfp != NULL)
      fclose(wavfp);
   if (rawfp != NULL)
      fclose(rawfp);
   if (tracefp != NULL)
      fclose(tracefp);

   return 0;
}
HTS_ENGINE_C_END;
#endif /* !HTS_ENGINE_C */