/src/espeak-ng/src/libespeak-ng/ssml.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (C) 2005 to 2015 by Jonathan Duddington |
3 | | * email: jonsd@users.sourceforge.net |
4 | | * Copyright (C) 2015-2017 Reece H. Dunn |
5 | | * Copyright (C) 2018 Juho Hiltunen |
6 | | * |
7 | | * This program is free software; you can redistribute it and/or modify |
8 | | * it under the terms of the GNU General Public License as published by |
9 | | * the Free Software Foundation; either version 3 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * This program is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU General Public License |
18 | | * along with this program; if not, see: <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | |
22 | | #include "config.h" |
23 | | |
24 | | #include <ctype.h> |
25 | | #include <errno.h> |
26 | | #include <locale.h> |
27 | | #include <math.h> |
28 | | #include <stdint.h> |
29 | | #include <stdio.h> |
30 | | #include <stdlib.h> |
31 | | #include <string.h> |
32 | | #include <unistd.h> |
33 | | #include <wchar.h> |
34 | | #include <wctype.h> |
35 | | |
36 | | |
37 | | #include <espeak-ng/espeak_ng.h> |
38 | | #include <espeak-ng/speak_lib.h> |
39 | | #include <espeak-ng/encoding.h> |
40 | | #include <ucd/ucd.h> |
41 | | |
42 | | #include "ssml.h" |
43 | | #include "common.h" // for strncpy0 |
44 | | #include "mnemonics.h" // for LookupMnemName, MNEM_TAB, |
45 | | #include "readclause.h" // for PARAM_STACK, param_stack, AddNameData |
46 | | #include "soundicon.h" // for LoadSoundFile2 |
47 | | #include "synthesize.h" // for SPEED_FACTORS, speed |
48 | | #include "translate.h" // for CTRL_EMBEDDED |
49 | | #include "voice.h" // for SelectVoice, SelectVoiceByName |
50 | | #include "speech.h" // for MAKE_MEM_UNDEFINED |
51 | | |
52 | | static const MNEM_TAB ssmltags[] = { |
53 | | { "speak", SSML_SPEAK }, |
54 | | { "voice", SSML_VOICE }, |
55 | | { "prosody", SSML_PROSODY }, |
56 | | { "say-as", SSML_SAYAS }, |
57 | | { "mark", SSML_MARK }, |
58 | | { "s", SSML_SENTENCE }, |
59 | | { "p", SSML_PARAGRAPH }, |
60 | | { "phoneme", SSML_PHONEME }, |
61 | | { "sub", SSML_SUB }, |
62 | | { "tts:style", SSML_STYLE }, |
63 | | { "audio", SSML_AUDIO }, |
64 | | { "emphasis", SSML_EMPHASIS }, |
65 | | { "break", SSML_BREAK }, |
66 | | { "metadata", SSML_IGNORE_TEXT }, |
67 | | |
68 | | { "br", HTML_BREAK }, |
69 | | { "li", HTML_BREAK }, |
70 | | { "dd", HTML_BREAK }, |
71 | | { "img", HTML_BREAK }, |
72 | | { "td", HTML_BREAK }, |
73 | | { "h1", SSML_PARAGRAPH }, |
74 | | { "h2", SSML_PARAGRAPH }, |
75 | | { "h3", SSML_PARAGRAPH }, |
76 | | { "h4", SSML_PARAGRAPH }, |
77 | | { "hr", SSML_PARAGRAPH }, |
78 | | { "script", SSML_IGNORE_TEXT }, |
79 | | { "style", SSML_IGNORE_TEXT }, |
80 | | { "font", HTML_NOSPACE }, |
81 | | { "b", HTML_NOSPACE }, |
82 | | { "i", HTML_NOSPACE }, |
83 | | { "strong", HTML_NOSPACE }, |
84 | | { "em", HTML_NOSPACE }, |
85 | | { "code", HTML_NOSPACE }, |
86 | | |
87 | | { NULL, 0 } |
88 | | }; |
89 | | |
90 | | static int (*uri_callback)(int, const char *, const char *) = NULL; |
91 | | |
// Compare an attribute value with a plain (narrow) string.
//
// string1 is the attribute text as it appears in the tag; it is considered
// equal to string2 only if string2 is matched in full and the next character
// of string1 is a closing quote (" or ').
//
// Returns 0 on a match, 1 otherwise (including when string1 is NULL).
static int attrcmp(const wchar_t *string1, const char *string2)
{
	const wchar_t *wp;
	const char *cp;
	int at_quote;

	if (string1 == NULL)
		return 1;

	// advance over the common prefix
	wp = string1;
	cp = string2;
	while ((*wp != 0) && (*wp == *cp)) {
		wp++;
		cp++;
	}

	at_quote = ((*wp == '"') || (*wp == '\''));
	if (at_quote && (*cp == 0))
		return 0;
	return 1;
}
105 | | |
106 | | |
107 | | static int attrlookup(const wchar_t *string1, const MNEM_TAB *mtab) |
108 | 16.0k | { |
109 | 16.0k | int ix; |
110 | | |
111 | 63.9k | for (ix = 0; mtab[ix].mnem != NULL; ix++) { |
112 | 47.8k | if (attrcmp(string1, mtab[ix].mnem) == 0) |
113 | 0 | return mtab[ix].value; |
114 | 47.8k | } |
115 | 16.0k | return mtab[ix].value; |
116 | 16.0k | } |
117 | | |
// Parse a non-negative decimal number from the start of an attribute value.
//
// pw            attribute text, may be NULL
// default_value returned when pw is NULL or does not start with a digit
// type          1: the value is a time; a trailing 's' (either case) means
//               seconds and the result is converted to milliseconds
//
// The attribute text comes from the input markup, so the digit string can be
// arbitrarily long. The value is accumulated in 64 bits and saturated at
// ATTR_NUMBER_MAX to avoid signed integer overflow; the cap also leaves
// headroom for callers that add small offsets to the result.
static int attrnumber(const wchar_t *pw, int default_value, int type)
{
	enum { ATTR_NUMBER_MAX = 1000000000 };
	int64_t value = 0;

	if ((pw == NULL) || !IsDigit09(*pw))
		return default_value;

	while (IsDigit09(*pw)) {
		int digit = *pw++ - '0';
		// once past the cap, keep consuming digits but stop growing
		if (value <= ATTR_NUMBER_MAX)
			value = value*10 + digit;
	}
	if ((type == 1) && (ucd_tolower(*pw) == 's')) {
		// time: seconds rather than ms
		value *= 1000;
	}
	if (value > ATTR_NUMBER_MAX)
		value = ATTR_NUMBER_MAX;
	return (int)value;
}
133 | | |
// Convert an attribute value into UTF-8.
//
// buf  destination buffer
// pw   attribute text as returned by GetSsmlAttribute(), or NULL
// len  size of buf in bytes
//
// The character before pw decides how the value ends: if it is a quote, the
// value runs up to the matching quote (unless that quote is preceded by a
// backslash); otherwise it runs up to whitespace or '/'.
// buf is always zero-terminated (when len > 0). Returns the number of bytes
// written, excluding the terminator; a NULL pw gives an empty string.
static int attrcopy_utf8(char *buf, const wchar_t *pw, int len)
{
	int ix = 0;

	if (len <= 0)
		return 0; // no room for even the terminator

	if (pw != NULL) {
		wchar_t quote = pw[-1];
		if ((quote != '"') && (quote != '\'')) quote = 0;

		unsigned int c;
		int prev_c = 0;
		// stop 4 bytes short of the end; presumably the most utf8_out() can emit
		// for one character, plus the terminator -- TODO confirm
		while ((ix < (len-4)) && ((c = *pw++) != 0)) {
			// c is a wide character: use iswspace(), isspace() is undefined
			// for values that do not fit in an unsigned char
			if ((quote == 0) && (iswspace((wint_t)c) || (c == '/')))
				break;
			if ((quote != 0) && (c == quote) && (prev_c != '\\'))
				break; // " indicates end of attribute, unless preceded by backstroke

			int n = utf8_out(c, &buf[ix]);
			ix += n;
			prev_c = c;
		}
	}
	buf[ix] = 0;
	return ix;
}
159 | | |
160 | | static int attr_prosody_value(int param_type, const wchar_t *pw, int *value_out) |
161 | 0 | { |
162 | 0 | int sign = 0; |
163 | 0 | wchar_t *tail; |
164 | 0 | double value; |
165 | |
|
166 | 0 | while (iswspace(*pw)) pw++; |
167 | 0 | if (*pw == '+') { |
168 | 0 | pw++; |
169 | 0 | sign = 1; |
170 | 0 | } |
171 | 0 | if (*pw == '-') { |
172 | 0 | pw++; |
173 | 0 | sign = -1; |
174 | 0 | } |
175 | 0 | value = (double)wcstod(pw, &tail); |
176 | 0 | if (tail == pw) { |
177 | | // failed to find a number, return 100% |
178 | 0 | *value_out = 100; |
179 | 0 | return 2; |
180 | 0 | } |
181 | | |
182 | 0 | if (*tail == '%') { |
183 | 0 | if (sign != 0) |
184 | 0 | value = 100 + (sign * value); |
185 | 0 | *value_out = (int)value; |
186 | 0 | return 2; // percentage |
187 | 0 | } |
188 | | |
189 | 0 | if ((tail[0] == 's') && (tail[1] == 't')) { |
190 | 0 | double x; |
191 | | // convert from semitones to a frequency percentage |
192 | 0 | x = pow((double)2.0, (double)((value*sign)/12)) * 100; |
193 | 0 | *value_out = (int)x; |
194 | 0 | return 2; // percentage |
195 | 0 | } |
196 | | |
197 | 0 | if (param_type == espeakRATE) { |
198 | 0 | if (sign == 0) |
199 | 0 | *value_out = (int)(value * 100); |
200 | 0 | else |
201 | 0 | *value_out = 100 + (int)(sign * value * 100); |
202 | 0 | return 2; // percentage |
203 | 0 | } |
204 | | |
205 | 0 | *value_out = (int)value; |
206 | 0 | return sign; // -1, 0, or 1 |
207 | 0 | } |
208 | | |
209 | | static const char *VoiceFromStack(SSML_STACK *ssml_stack, int n_ssml_stack, espeak_VOICE *base_voice, char base_voice_variant_name[40]) |
210 | 15.7k | { |
211 | | // Use the voice properties from the SSML stack to choose a voice, and switch |
212 | | // to that voice if it's not the current voice |
213 | | |
214 | 15.7k | int ix; |
215 | 15.7k | const char *p; |
216 | 15.7k | SSML_STACK *sp; |
217 | 15.7k | const char *v_id; |
218 | 15.7k | int voice_found; |
219 | 15.7k | espeak_VOICE voice_select; |
220 | 15.7k | static char voice_name[40]; |
221 | 15.7k | static char identifier[40]; |
222 | 15.7k | char language[40]; |
223 | | |
224 | 15.7k | MAKE_MEM_UNDEFINED(&voice_name, sizeof(voice_name)); |
225 | | |
226 | 15.7k | strcpy(voice_name, ssml_stack[0].voice_name); |
227 | 15.7k | strcpy(language, ssml_stack[0].language); |
228 | 15.7k | voice_select.age = ssml_stack[0].voice_age; |
229 | 15.7k | voice_select.gender = ssml_stack[0].voice_gender; |
230 | 15.7k | voice_select.variant = ssml_stack[0].voice_variant_number; |
231 | 15.7k | voice_select.identifier = NULL; |
232 | | |
233 | 46.9k | for (ix = 0; ix < n_ssml_stack; ix++) { |
234 | 31.1k | espeak_VOICE *v; |
235 | 31.1k | sp = &ssml_stack[ix]; |
236 | 31.1k | int voice_name_specified = 0; |
237 | | |
238 | 31.1k | if ((sp->voice_name[0] != 0) && ((v = SelectVoiceByName(NULL, sp->voice_name)) != NULL)) { |
239 | 15.7k | voice_name_specified = 1; |
240 | 15.7k | strcpy(voice_name, sp->voice_name); |
241 | 15.7k | strcpy(identifier, v->identifier); |
242 | 15.7k | language[0] = 0; |
243 | 15.7k | voice_select.gender = ENGENDER_UNKNOWN; |
244 | 15.7k | voice_select.age = 0; |
245 | 15.7k | voice_select.variant = 0; |
246 | 15.7k | } |
247 | 31.1k | if (sp->language[0] != 0) { |
248 | 27.1k | strcpy(language, sp->language); |
249 | | |
250 | | // is this language provided by the base voice? |
251 | 27.1k | p = base_voice->languages; |
252 | 59.3k | while (*p++ != 0) { |
253 | 40.8k | if (strcmp(p, language) == 0) { |
254 | | // yes, change the language to the main language of the base voice |
255 | 8.66k | strcpy(language, &base_voice->languages[1]); |
256 | 8.66k | break; |
257 | 8.66k | } |
258 | 32.1k | p += (strlen(p) + 1); |
259 | 32.1k | } |
260 | | |
261 | 27.1k | if (voice_name_specified == 0) |
262 | 11.4k | { |
263 | 11.4k | voice_name[0] = 0; // forget a previous voice name if a language is specified |
264 | 11.4k | identifier[0] = 0; |
265 | 11.4k | } |
266 | 27.1k | } |
267 | 31.1k | if (sp->voice_gender != ENGENDER_UNKNOWN) |
268 | 0 | voice_select.gender = sp->voice_gender; |
269 | | |
270 | 31.1k | if (sp->voice_age != 0) |
271 | 0 | voice_select.age = sp->voice_age; |
272 | 31.1k | if (sp->voice_variant_number != 0) |
273 | 0 | voice_select.variant = sp->voice_variant_number; |
274 | 31.1k | } |
275 | | |
276 | 15.7k | voice_select.name = voice_name; |
277 | 15.7k | voice_select.identifier = identifier; |
278 | 15.7k | voice_select.languages = language; |
279 | | |
280 | 15.7k | v_id = SelectVoice(&voice_select, &voice_found); |
281 | 15.7k | if (v_id == NULL) |
282 | 0 | return "default"; |
283 | | |
284 | 15.7k | if ((strchr(v_id, '+') == NULL) && ((voice_select.gender == ENGENDER_UNKNOWN) || (voice_select.gender == base_voice->gender)) && (base_voice_variant_name[0] != 0)) { |
285 | | // a voice variant has not been selected, use the original voice variant |
286 | 0 | char buf[80]; |
287 | 0 | sprintf(buf, "%s+%s", v_id, base_voice_variant_name); |
288 | 0 | strncpy0(voice_name, buf, sizeof(voice_name)); |
289 | 0 | return voice_name; |
290 | 0 | } |
291 | 15.7k | return v_id; |
292 | 15.7k | } |
293 | | |
294 | | |
// Gets the value string for an attribute.
//
// pw    the attribute part of a tag, zero-terminated. pw[-1] is read, so pw
//       must not point at the first element of its buffer.
// name  the attribute name to look for
//
// An attribute name is only recognised directly after whitespace.
// Returns:
//   - a pointer to the first character of the value (after the opening quote
//     if the value is quoted),
//   - a pointer to an empty string if the attribute is followed by
//     whitespace or '/',
//   - NULL if the attribute is not present.
static const wchar_t *GetSsmlAttribute(wchar_t *pw, const char *name)
{
	int ix;
	static const wchar_t empty[1] = { 0 };

	while (*pw != 0) {
		if (iswspace(pw[-1])) {
			ix = 0;
			// stop at the end of name: without this check the two zero
			// terminators compare equal when the tag text ends right after
			// the attribute name, and both pointers run off their buffers
			while ((name[ix] != 0) && (*pw == name[ix])) {
				pw++;
				ix++;
			}
			if (name[ix] == 0) {
				// found the attribute, now get the value
				while (iswspace(*pw)) pw++;
				if (*pw == '=') pw++;
				while (iswspace(*pw)) pw++;
				if ((*pw == '"') || (*pw == '\'')) // allow single-quotes ?
					return pw+1;
				else if (iswspace(*pw) || (*pw == '/')) // end of attribute
					return empty;
				else
					return pw;
			}
			if (*pw == 0)
				break; // a partial match ran into the end of the tag text
		}
		pw++;
	}
	return NULL;
}
327 | | |
328 | | |
329 | | static int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char *base_voice_variant_name) |
330 | 18.8k | { |
331 | | // Determines whether voice attribute are specified in this tag, and if so, whether this means |
332 | | // a voice change. |
333 | | // If it's a closing tag, delete the top frame of the stack and determine whether this implies |
334 | | // a voice change. |
335 | | // Returns CLAUSE_TYPE_VOICE_CHANGE if there is a voice change |
336 | | |
337 | 18.8k | const char *new_voice_id; |
338 | | |
339 | 18.8k | static const MNEM_TAB mnem_gender[] = { |
340 | 18.8k | { "male", ENGENDER_MALE }, |
341 | 18.8k | { "female", ENGENDER_FEMALE }, |
342 | 18.8k | { "neutral", ENGENDER_NEUTRAL }, |
343 | 18.8k | { NULL, ENGENDER_UNKNOWN } |
344 | 18.8k | }; |
345 | | |
346 | 18.8k | if (tag_type & SSML_CLOSE) { |
347 | | // delete a stack frame |
348 | 365 | if (n_ssml_stack > 1) |
349 | 0 | n_ssml_stack--; |
350 | 18.5k | } else { |
351 | 18.5k | const wchar_t *lang; |
352 | 18.5k | const wchar_t *gender; |
353 | 18.5k | const wchar_t *name; |
354 | 18.5k | const wchar_t *age; |
355 | 18.5k | const wchar_t *variant; |
356 | | |
357 | | // add a stack frame if any voice details are specified |
358 | 18.5k | lang = GetSsmlAttribute(pw, "xml:lang"); |
359 | | |
360 | 18.5k | if (tag_type != SSML_VOICE) { |
361 | | // only expect an xml:lang attribute |
362 | 14.5k | name = NULL; |
363 | 14.5k | variant = NULL; |
364 | 14.5k | age = NULL; |
365 | 14.5k | gender = NULL; |
366 | 14.5k | } else { |
367 | 3.96k | name = GetSsmlAttribute(pw, "name"); |
368 | 3.96k | variant = GetSsmlAttribute(pw, "variant"); |
369 | 3.96k | age = GetSsmlAttribute(pw, "age"); |
370 | 3.96k | gender = GetSsmlAttribute(pw, "gender"); |
371 | 3.96k | } |
372 | | |
373 | 18.5k | if ((tag_type != SSML_VOICE) && (lang == NULL)) |
374 | 3.12k | return 0; // <s> or <p> without language spec, nothing to do |
375 | | |
376 | 15.3k | ssml_sp = &ssml_stack[n_ssml_stack++]; |
377 | | |
378 | 15.3k | int value; |
379 | | |
380 | 15.3k | attrcopy_utf8(ssml_sp->language, lang, sizeof(ssml_sp->language)); |
381 | 15.3k | attrcopy_utf8(ssml_sp->voice_name, name, sizeof(ssml_sp->voice_name)); |
382 | 15.3k | if ((value = attrnumber(variant, 1, 0)) > 0) |
383 | 15.3k | value--; // variant='0' and variant='1' the same |
384 | 15.3k | ssml_sp->voice_variant_number = value; |
385 | 15.3k | ssml_sp->voice_age = attrnumber(age, 0, 0); |
386 | 15.3k | ssml_sp->voice_gender = attrlookup(gender, mnem_gender); |
387 | 15.3k | ssml_sp->tag_type = tag_type; |
388 | 15.3k | } |
389 | | |
390 | 15.7k | new_voice_id = VoiceFromStack(ssml_stack, n_ssml_stack, base_voice, base_voice_variant_name); |
391 | 15.7k | if (strcmp(new_voice_id, current_voice_id) != 0) { |
392 | | // add an embedded command to change the voice |
393 | 9.01k | strcpy(current_voice_id, new_voice_id); |
394 | 9.01k | return CLAUSE_TYPE_VOICE_CHANGE; |
395 | 9.01k | } |
396 | | |
397 | 6.74k | return 0; |
398 | 15.7k | } |
399 | | |
400 | | static void ProcessParamStack(char *outbuf, int *outix, int n_param_stack, PARAM_STACK *param_stack, int *speech_parameters) |
401 | 7.96k | { |
402 | | // Set the speech parameters from the parameter stack |
403 | 7.96k | int param; |
404 | 7.96k | int ix; |
405 | 7.96k | char buf[20]; |
406 | 7.96k | int new_parameters[N_SPEECH_PARAM]; |
407 | 7.96k | static const char cmd_letter[N_SPEECH_PARAM] = { 0, 'S', 'A', 'P', 'R', 0, 'C', 0, 0, 0, 0, 0, 'F' }; // embedded command letters |
408 | | |
409 | 127k | for (param = 0; param < N_SPEECH_PARAM; param++) |
410 | 119k | new_parameters[param] = -1; |
411 | | |
412 | 87.0k | for (ix = 0; ix < n_param_stack; ix++) { |
413 | 1.26M | for (param = 0; param < N_SPEECH_PARAM; param++) { |
414 | 1.18M | if (param_stack[ix].parameter[param] >= 0) |
415 | 205k | new_parameters[param] = param_stack[ix].parameter[param]; |
416 | 1.18M | } |
417 | 79.1k | } |
418 | | |
419 | 127k | for (param = 0; param < N_SPEECH_PARAM; param++) { |
420 | 119k | int value; |
421 | 119k | if ((value = new_parameters[param]) != speech_parameters[param]) { |
422 | 513 | buf[0] = 0; |
423 | | |
424 | 513 | switch (param) |
425 | 513 | { |
426 | 0 | case espeakPUNCTUATION: |
427 | 0 | option_punctuation = value-1; |
428 | 0 | break; |
429 | 0 | case espeakCAPITALS: |
430 | 0 | option_capitals = value; |
431 | 0 | break; |
432 | 0 | case espeakRATE: |
433 | 3 | case espeakVOLUME: |
434 | 3 | case espeakPITCH: |
435 | 6 | case espeakRANGE: |
436 | 513 | case espeakEMPHASIS: |
437 | 513 | sprintf(buf, "%c%d%c", CTRL_EMBEDDED, value, cmd_letter[param]); |
438 | 513 | break; |
439 | 513 | } |
440 | | |
441 | 513 | speech_parameters[param] = new_parameters[param]; |
442 | 513 | strcpy(&outbuf[*outix], buf); |
443 | 513 | *outix += strlen(buf); |
444 | 513 | } |
445 | 119k | } |
446 | 7.96k | } |
447 | | |
448 | | static PARAM_STACK *PushParamStack(int tag_type, int *n_param_stack, PARAM_STACK *param_stack) |
449 | 5.49k | { |
450 | 5.49k | int ix; |
451 | 5.49k | PARAM_STACK *sp; |
452 | | |
453 | 5.49k | sp = ¶m_stack[*n_param_stack]; |
454 | 5.49k | if (*n_param_stack < (N_PARAM_STACK-1)) |
455 | 2.58k | (*n_param_stack)++; |
456 | | |
457 | 5.49k | sp->type = tag_type; |
458 | 87.9k | for (ix = 0; ix < N_SPEECH_PARAM; ix++) |
459 | 82.4k | sp->parameter[ix] = -1; |
460 | 5.49k | return sp; |
461 | 5.49k | } |
462 | | |
463 | | static void PopParamStack(int tag_type, char *outbuf, int *outix, int *n_param_stack, PARAM_STACK *param_stack, int *speech_parameters) |
464 | 2.47k | { |
465 | | // unwind the stack up to and including the previous tag of this type |
466 | 2.47k | int ix; |
467 | 2.47k | int top = 0; |
468 | | |
469 | 2.47k | if (tag_type >= SSML_CLOSE) |
470 | 1.77k | tag_type -= SSML_CLOSE; |
471 | | |
472 | 10.9k | for (ix = 0; ix < *n_param_stack; ix++) { |
473 | 8.44k | if (param_stack[ix].type == tag_type) |
474 | 5.55k | top = ix; |
475 | 8.44k | } |
476 | 2.47k | if (top > 0) |
477 | 1.57k | *n_param_stack = top; |
478 | 2.47k | ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters); |
479 | 2.47k | } |
480 | | |
481 | | static int ReplaceKeyName(char *outbuf, int index, int *outix) |
482 | 0 | { |
483 | | // Replace some key-names by single characters, so they can be pronounced in different languages |
484 | 0 | static const MNEM_TAB keynames[] = { |
485 | 0 | { "space ", 0xe020 }, |
486 | 0 | { "tab ", 0xe009 }, |
487 | 0 | { "underscore ", 0xe05f }, |
488 | 0 | { "double-quote ", '"' }, |
489 | 0 | { NULL, 0 } |
490 | 0 | }; |
491 | |
|
492 | 0 | int letter; |
493 | 0 | char *p; |
494 | |
|
495 | 0 | p = &outbuf[index]; |
496 | |
|
497 | 0 | if ((letter = LookupMnem(keynames, p)) != 0) { |
498 | 0 | int ix; |
499 | 0 | ix = utf8_out(letter, p); |
500 | 0 | *outix = index + ix; |
501 | 0 | return letter; |
502 | 0 | } |
503 | 0 | return 0; |
504 | 0 | } |
505 | | |
506 | | static void SetProsodyParameter(int param_type, const wchar_t *attr1, PARAM_STACK *sp, PARAM_STACK *param_stack, int *speech_parameters) |
507 | 0 | { |
508 | 0 | int value; |
509 | | |
510 | |
|
511 | 0 | static const MNEM_TAB mnem_volume[] = { |
512 | 0 | { "default", 100 }, |
513 | 0 | { "silent", 0 }, |
514 | 0 | { "x-soft", 30 }, |
515 | 0 | { "soft", 65 }, |
516 | 0 | { "medium", 100 }, |
517 | 0 | { "loud", 150 }, |
518 | 0 | { "x-loud", 230 }, |
519 | 0 | { NULL, -1 } |
520 | 0 | }; |
521 | |
|
522 | 0 | static const MNEM_TAB mnem_rate[] = { |
523 | 0 | { "default", 100 }, |
524 | 0 | { "x-slow", 60 }, |
525 | 0 | { "slow", 80 }, |
526 | 0 | { "medium", 100 }, |
527 | 0 | { "fast", 125 }, |
528 | 0 | { "x-fast", 160 }, |
529 | 0 | { NULL, -1 } |
530 | 0 | }; |
531 | |
|
532 | 0 | static const MNEM_TAB mnem_pitch[] = { |
533 | 0 | { "default", 100 }, |
534 | 0 | { "x-low", 70 }, |
535 | 0 | { "low", 85 }, |
536 | 0 | { "medium", 100 }, |
537 | 0 | { "high", 110 }, |
538 | 0 | { "x-high", 120 }, |
539 | 0 | { NULL, -1 } |
540 | 0 | }; |
541 | |
|
542 | 0 | static const MNEM_TAB mnem_range[] = { |
543 | 0 | { "default", 100 }, |
544 | 0 | { "x-low", 20 }, |
545 | 0 | { "low", 50 }, |
546 | 0 | { "medium", 100 }, |
547 | 0 | { "high", 140 }, |
548 | 0 | { "x-high", 180 }, |
549 | 0 | { NULL, -1 } |
550 | 0 | }; |
551 | |
|
552 | 0 | static const MNEM_TAB * const mnem_tabs[5] = { |
553 | 0 | NULL, mnem_rate, mnem_volume, mnem_pitch, mnem_range |
554 | 0 | }; |
555 | |
|
556 | 0 | if ((value = attrlookup(attr1, mnem_tabs[param_type])) >= 0) { |
557 | | // mnemonic specifies a value as a percentage of the base pitch/range/rate/volume |
558 | 0 | sp->parameter[param_type] = (param_stack[0].parameter[param_type] * value)/100; |
559 | 0 | } else { |
560 | 0 | int sign = attr_prosody_value(param_type, attr1, &value); |
561 | |
|
562 | 0 | if (sign == 0) |
563 | 0 | sp->parameter[param_type] = value; // absolute value in Hz |
564 | 0 | else if (sign == 2) { |
565 | | // change specified as percentage or in semitones |
566 | 0 | sp->parameter[param_type] = (speech_parameters[param_type] * value)/100; |
567 | 0 | } else { |
568 | | // change specified as plus or minus Hz |
569 | 0 | sp->parameter[param_type] = speech_parameters[param_type] + (value*sign); |
570 | 0 | } |
571 | 0 | } |
572 | 0 | } |
573 | | |
574 | | int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, const char *xmlbase, bool *audio_text, char *current_voice_id, espeak_VOICE *base_voice, char *base_voice_variant_name, bool *ignore_text, bool *clear_skipping_text, int *sayas_mode, int *sayas_start, SSML_STACK *ssml_stack, int *n_ssml_stack, int *n_param_stack, int *speech_parameters) |
575 | 207k | { |
576 | | // xml_buf is the tag and attributes with a zero terminator in place of the original '>' |
577 | | // returns a clause terminator value. |
578 | | |
579 | 207k | unsigned int ix; |
580 | 207k | int index; |
581 | 207k | int tag_type; |
582 | 207k | int value; |
583 | 207k | int value2; |
584 | 207k | int value3; |
585 | 207k | int voice_change_flag; |
586 | 207k | wchar_t *px; |
587 | 207k | const wchar_t *attr1; |
588 | 207k | const wchar_t *attr2; |
589 | 207k | const wchar_t *attr3; |
590 | 207k | int terminator; |
591 | 207k | int param_type; |
592 | 207k | char tag_name[40]; |
593 | 207k | char buf[160]; |
594 | 207k | PARAM_STACK *sp; |
595 | 207k | SSML_STACK *ssml_sp; |
596 | | |
597 | | // don't process comments and xml declarations |
598 | 207k | if (wcsncmp(xml_buf, (wchar_t *) "!--", 3) == 0 || wcsncmp(xml_buf, (wchar_t *) "?xml", 4) == 0) { |
599 | 0 | return 0; |
600 | 0 | } |
601 | | |
602 | | // these tags have no effect if they are self-closing, eg. <voice /> |
603 | 207k | static const char ignore_if_self_closing[] = { 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0 }; |
604 | | |
605 | 207k | bool self_closing = false; |
606 | 207k | int len; |
607 | 207k | len = wcslen(xml_buf); |
608 | 207k | if (xml_buf[len - 1] == '/') { |
609 | | // a self-closing tag |
610 | 1.43k | xml_buf[len - 1] = ' '; |
611 | 1.43k | self_closing = true; |
612 | 1.43k | } |
613 | | |
614 | 207k | static const MNEM_TAB mnem_phoneme_alphabet[] = { |
615 | 207k | { "espeak", 1 }, |
616 | 207k | { NULL, -1 } |
617 | 207k | }; |
618 | | |
619 | 207k | static const MNEM_TAB mnem_punct[] = { |
620 | 207k | { "none", 1 }, |
621 | 207k | { "all", 2 }, |
622 | 207k | { "some", 3 }, |
623 | 207k | { NULL, -1 } |
624 | 207k | }; |
625 | | |
626 | 207k | static const MNEM_TAB mnem_capitals[] = { |
627 | 207k | { "no", 0 }, |
628 | 207k | { "icon", 1 }, |
629 | 207k | { "spelling", 2 }, |
630 | 207k | { "pitch", 20 }, // this is the amount by which to raise the pitch |
631 | 207k | { NULL, -1 } |
632 | 207k | }; |
633 | | |
634 | 207k | static const MNEM_TAB mnem_interpret_as[] = { |
635 | 207k | { "characters", SAYAS_CHARS }, |
636 | 207k | { "tts:char", SAYAS_SINGLE_CHARS }, |
637 | 207k | { "tts:key", SAYAS_KEY }, |
638 | 207k | { "tts:digits", SAYAS_DIGITS }, |
639 | 207k | { "telephone", SAYAS_DIGITS1 }, |
640 | 207k | { NULL, -1 } |
641 | 207k | }; |
642 | | |
643 | 207k | static const MNEM_TAB mnem_sayas_format[] = { |
644 | 207k | { "glyphs", 1 }, |
645 | 207k | { NULL, -1 } |
646 | 207k | }; |
647 | | |
648 | 207k | static const MNEM_TAB mnem_break[] = { |
649 | 207k | { "none", 0 }, |
650 | 207k | { "x-weak", 1 }, |
651 | 207k | { "weak", 2 }, |
652 | 207k | { "medium", 3 }, |
653 | 207k | { "strong", 4 }, |
654 | 207k | { "x-strong", 5 }, |
655 | 207k | { NULL, -1 } |
656 | 207k | }; |
657 | | |
658 | 207k | static const MNEM_TAB mnem_emphasis[] = { |
659 | 207k | { "none", 1 }, |
660 | 207k | { "reduced", 2 }, |
661 | 207k | { "moderate", 3 }, |
662 | 207k | { "strong", 4 }, |
663 | 207k | { "x-strong", 5 }, |
664 | 207k | { NULL, -1 } |
665 | 207k | }; |
666 | | |
667 | 207k | static const char * const prosody_attr[5] = { |
668 | 207k | NULL, "rate", "volume", "pitch", "range" |
669 | 207k | }; |
670 | | |
671 | 589k | for (ix = 0; ix < (sizeof(tag_name)-1); ix++) { |
672 | 589k | int c; |
673 | 589k | if (((c = xml_buf[ix]) == 0) || iswspace(c)) |
674 | 207k | break; |
675 | 381k | tag_name[ix] = tolower((char)c); |
676 | 381k | } |
677 | 207k | tag_name[ix] = 0; |
678 | | |
679 | 207k | px = &xml_buf[ix]; // the tag's attributes |
680 | | |
681 | 207k | if (tag_name[0] == '/') { |
682 | | // closing tag |
683 | 5.02k | if ((tag_type = LookupMnem(ssmltags, &tag_name[1])) != HTML_NOSPACE) |
684 | 4.83k | outbuf[(*outix)++] = ' '; |
685 | 5.02k | tag_type += SSML_CLOSE; |
686 | 202k | } else { |
687 | 202k | if ((tag_type = LookupMnem(ssmltags, tag_name)) != HTML_NOSPACE) { |
688 | | // separate SSML tags from the previous word (but not HMTL tags such as <b> <font> which can occur inside a word) |
689 | 202k | outbuf[(*outix)++] = ' '; |
690 | 202k | } |
691 | | |
692 | 202k | if (self_closing && ignore_if_self_closing[tag_type]) |
693 | 50 | return 0; |
694 | 202k | } |
695 | | |
696 | 207k | voice_change_flag = 0; |
697 | 207k | ssml_sp = &ssml_stack[*n_ssml_stack-1]; |
698 | | |
699 | 207k | switch (tag_type) |
700 | 207k | { |
701 | 92 | case SSML_STYLE: |
702 | 92 | sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack); |
703 | 92 | attr1 = GetSsmlAttribute(px, "field"); |
704 | 92 | attr2 = GetSsmlAttribute(px, "mode"); |
705 | | |
706 | | |
707 | 92 | if (attrcmp(attr1, "punctuation") == 0) { |
708 | 0 | value = attrlookup(attr2, mnem_punct); |
709 | 0 | sp->parameter[espeakPUNCTUATION] = value; |
710 | 92 | } else if (attrcmp(attr1, "capital_letters") == 0) { |
711 | 0 | value = attrlookup(attr2, mnem_capitals); |
712 | 0 | sp->parameter[espeakCAPITALS] = value; |
713 | 0 | } |
714 | 92 | ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters); |
715 | 92 | break; |
716 | 1.13k | case SSML_PROSODY: |
717 | 1.13k | sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack); |
718 | | |
719 | | // look for attributes: rate, volume, pitch, range |
720 | 5.65k | for (param_type = espeakRATE; param_type <= espeakRANGE; param_type++) { |
721 | 4.52k | if ((attr1 = GetSsmlAttribute(px, prosody_attr[param_type])) != NULL) |
722 | 0 | SetProsodyParameter(param_type, attr1, sp, param_stack, speech_parameters); |
723 | 4.52k | } |
724 | | |
725 | 1.13k | ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters); |
726 | 1.13k | break; |
727 | 2.88k | case SSML_EMPHASIS: |
728 | 2.88k | sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack); |
729 | 2.88k | value = 3; // default is "moderate" |
730 | 2.88k | if ((attr1 = GetSsmlAttribute(px, "level")) != NULL) |
731 | 0 | value = attrlookup(attr1, mnem_emphasis); |
732 | | |
733 | 2.88k | if (translator->langopts.tone_language == 1) { |
734 | 2 | static const unsigned char emphasis_to_pitch_range[] = { 50, 50, 40, 70, 90, 100 }; |
735 | 2 | static const unsigned char emphasis_to_volume[] = { 100, 100, 70, 110, 135, 150 }; |
736 | | // tone language (eg.Chinese) do emphasis by increasing the pitch range. |
737 | 2 | sp->parameter[espeakRANGE] = emphasis_to_pitch_range[value]; |
738 | 2 | sp->parameter[espeakVOLUME] = emphasis_to_volume[value]; |
739 | 2.88k | } else { |
740 | 2.88k | static const unsigned char emphasis_to_volume2[] = { 100, 100, 75, 100, 120, 150 }; |
741 | 2.88k | sp->parameter[espeakVOLUME] = emphasis_to_volume2[value]; |
742 | 2.88k | sp->parameter[espeakEMPHASIS] = value; |
743 | 2.88k | } |
744 | 2.88k | ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters); |
745 | 2.88k | break; |
746 | 57 | case SSML_STYLE + SSML_CLOSE: |
747 | 461 | case SSML_PROSODY + SSML_CLOSE: |
748 | 1.23k | case SSML_EMPHASIS + SSML_CLOSE: |
749 | 1.23k | PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters); |
750 | 1.23k | break; |
751 | 63 | case SSML_PHONEME: |
752 | 63 | attr1 = GetSsmlAttribute(px, "alphabet"); |
753 | 63 | attr2 = GetSsmlAttribute(px, "ph"); |
754 | 63 | value = attrlookup(attr1, mnem_phoneme_alphabet); |
755 | 63 | if (value == 1) { // alphabet="espeak" |
756 | 0 | outbuf[(*outix)++] = '['; |
757 | 0 | outbuf[(*outix)++] = '['; |
758 | 0 | *outix += attrcopy_utf8(&outbuf[*outix], attr2, n_outbuf-*outix); |
759 | 0 | outbuf[(*outix)++] = ']'; |
760 | 0 | outbuf[(*outix)++] = ']'; |
761 | 0 | } |
762 | 63 | break; |
763 | 276 | case SSML_SAYAS: |
764 | 276 | attr1 = GetSsmlAttribute(px, "interpret-as"); |
765 | 276 | attr2 = GetSsmlAttribute(px, "format"); |
766 | 276 | attr3 = GetSsmlAttribute(px, "detail"); |
767 | 276 | value = attrlookup(attr1, mnem_interpret_as); |
768 | 276 | value2 = attrlookup(attr2, mnem_sayas_format); |
769 | 276 | if (value2 == 1) |
770 | 0 | value = SAYAS_GLYPHS; |
771 | | |
772 | 276 | value3 = attrnumber(attr3, 0, 0); |
773 | | |
774 | 276 | if (value == SAYAS_DIGITS) { |
775 | 0 | if (value3 <= 1) |
776 | 0 | value = SAYAS_DIGITS1; |
777 | 0 | else |
778 | 0 | value = SAYAS_DIGITS + value3; |
779 | 0 | } |
780 | | |
781 | 276 | sprintf(buf, "%c%dY", CTRL_EMBEDDED, value); |
782 | 276 | strcpy(&outbuf[*outix], buf); |
783 | 276 | *outix += strlen(buf); |
784 | | |
785 | 276 | *sayas_start = *outix; |
786 | 276 | *sayas_mode = value; // punctuation doesn't end clause during SAY-AS |
787 | 276 | break; |
788 | 15 | case SSML_SAYAS + SSML_CLOSE: |
789 | 15 | if (*sayas_mode == SAYAS_KEY) { |
790 | 0 | outbuf[*outix] = 0; |
791 | 0 | ReplaceKeyName(outbuf, *sayas_start, outix); |
792 | 0 | } |
793 | | |
794 | 15 | outbuf[(*outix)++] = CTRL_EMBEDDED; |
795 | 15 | outbuf[(*outix)++] = 'Y'; |
796 | 15 | *sayas_mode = 0; |
797 | 15 | break; |
798 | 324 | case SSML_SUB: |
799 | 324 | if ((attr1 = GetSsmlAttribute(px, "alias")) != NULL) { |
800 | | // use the alias rather than the text |
801 | 0 | *ignore_text = true; |
802 | 0 | *outix += attrcopy_utf8(&outbuf[*outix], attr1, n_outbuf-*outix); |
803 | 0 | } |
804 | 324 | break; |
805 | 8.00k | case SSML_IGNORE_TEXT: |
806 | 8.00k | *ignore_text = true; |
807 | 8.00k | break; |
808 | 312 | case SSML_SUB + SSML_CLOSE: |
809 | 879 | case SSML_IGNORE_TEXT + SSML_CLOSE: |
810 | 879 | *ignore_text = false; |
811 | 879 | break; |
812 | 488 | case SSML_MARK: |
813 | 488 | if ((attr1 = GetSsmlAttribute(px, "name")) != NULL) { |
814 | | // add name to circular buffer of marker names |
815 | 0 | attrcopy_utf8(buf, attr1, sizeof(buf)); |
816 | |
|
817 | 0 | if ((buf[0] != 0) && (strcmp(skip_marker, buf) == 0)) { |
818 | | // This is the marker we are waiting for before starting to speak |
819 | 0 | *clear_skipping_text = true; |
820 | 0 | skip_marker[0] = 0; |
821 | 0 | return CLAUSE_NONE; |
822 | 0 | } |
823 | | |
824 | 0 | if ((index = AddNameData(buf, 0)) >= 0) { |
825 | 0 | sprintf(buf, "%c%dM", CTRL_EMBEDDED, index); |
826 | 0 | strcpy(&outbuf[*outix], buf); |
827 | 0 | *outix += strlen(buf); |
828 | 0 | } |
829 | 0 | } |
830 | 488 | break; |
831 | 1.38k | case SSML_AUDIO: |
832 | 1.38k | sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *)param_stack); |
833 | | |
834 | 1.38k | if ((attr1 = GetSsmlAttribute(px, "src")) != NULL) { |
835 | 0 | attrcopy_utf8(buf, attr1, sizeof(buf)); |
836 | |
|
837 | 0 | if (uri_callback == NULL) { |
838 | 0 | if ((xmlbase != NULL) && (buf[0] != '/')) { |
839 | 0 | char fname[256]; |
840 | 0 | sprintf(fname, "%s/%s", xmlbase, buf); |
841 | 0 | index = LoadSoundFile2(fname); |
842 | 0 | } else |
843 | 0 | index = LoadSoundFile2(buf); |
844 | 0 | if (index >= 0) { |
845 | 0 | sprintf(buf, "%c%dI", CTRL_EMBEDDED, index); |
846 | 0 | strcpy(&outbuf[*outix], buf); |
847 | 0 | *outix += strlen(buf); |
848 | 0 | sp->parameter[espeakSILENCE] = 1; |
849 | 0 | } |
850 | 0 | } else { |
851 | 0 | if ((index = AddNameData(buf, 0)) >= 0) { |
852 | 0 | char *uri; |
853 | 0 | uri = &namedata[index]; |
854 | 0 | if (uri_callback(1, uri, xmlbase) == 0) { |
855 | 0 | sprintf(buf, "%c%dU", CTRL_EMBEDDED, index); |
856 | 0 | strcpy(&outbuf[*outix], buf); |
857 | 0 | *outix += strlen(buf); |
858 | 0 | sp->parameter[espeakSILENCE] = 1; |
859 | 0 | } |
860 | 0 | } |
861 | 0 | } |
862 | 0 | } |
863 | 1.38k | ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters); |
864 | | |
865 | 1.38k | if (self_closing) |
866 | 701 | PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters); |
867 | 683 | else |
868 | 683 | *audio_text = true; |
869 | 1.38k | return CLAUSE_NONE; |
870 | 533 | case SSML_AUDIO + SSML_CLOSE: |
871 | 533 | PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters); |
872 | 533 | *audio_text = false; |
873 | 533 | return CLAUSE_NONE; |
874 | 50 | case SSML_BREAK: |
875 | 50 | value = 21; |
876 | 50 | terminator = CLAUSE_NONE; |
877 | | |
878 | 50 | if ((attr1 = GetSsmlAttribute(px, "strength")) != NULL) { |
879 | 0 | static const int break_value[6] = { 0, 7, 14, 21, 40, 80 }; // *10mS |
880 | 0 | value = attrlookup(attr1, mnem_break); |
881 | 0 | if (value < 0) value = 2; |
882 | 0 | if (value < 3) { |
883 | | // adjust prepause on the following word |
884 | 0 | sprintf(&outbuf[*outix], "%c%dB", CTRL_EMBEDDED, value); |
885 | 0 | *outix += 3; |
886 | 0 | terminator = 0; |
887 | 0 | } |
888 | 0 | value = break_value[value]; |
889 | 0 | } |
890 | 50 | if ((attr2 = GetSsmlAttribute(px, "time")) != NULL) { |
891 | 0 | value2 = attrnumber(attr2, 0, 1); // pause in mS |
892 | |
|
893 | 0 | value2 = value2 * speech_parameters[espeakSSML_BREAK_MUL] / 100; |
894 | |
|
895 | 0 | int wpm = speech_parameters[espeakRATE]; |
896 | 0 | espeak_SetParameter(espeakRATE, wpm, 0); |
897 | |
|
898 | 0 | #if USE_LIBSONIC |
899 | 0 | if (wpm >= espeakRATE_MAXIMUM) { |
900 | | // Compensate speedup with libsonic, see function SetSpeed() |
901 | 0 | double sonic = ((double)wpm)/espeakRATE_NORMAL; |
902 | 0 | value2 = value2 * sonic; |
903 | 0 | } |
904 | 0 | #endif |
905 | | |
906 | | // compensate for speaking speed to keep constant pause length, see function PauseLength() |
907 | | // 'value' here is x 10mS |
908 | 0 | value = (value2 * 256) / (speed.clause_pause_factor * 10); |
909 | 0 | if (value < 200) |
910 | 0 | value = (value2 * 256) / (speed.pause_factor * 10); |
911 | |
|
912 | 0 | if (terminator == 0) |
913 | 0 | terminator = CLAUSE_NONE; |
914 | 0 | } |
915 | 50 | if (terminator) { |
916 | 50 | if (value > 0xfff) { |
917 | | // scale down the value and set a scaling indicator bit |
918 | 0 | value = value / 32; |
919 | 0 | if (value > 0xfff) |
920 | 0 | value = 0xfff; |
921 | 0 | terminator |= CLAUSE_PAUSE_LONG; |
922 | 0 | } |
923 | 50 | return terminator + value; |
924 | 50 | } |
925 | 0 | break; |
926 | 179 | case SSML_SPEAK: |
927 | 179 | if ((attr1 = GetSsmlAttribute(px, "xml:base")) != NULL) { |
928 | 0 | attrcopy_utf8(buf, attr1, sizeof(buf)); |
929 | 0 | if ((index = AddNameData(buf, 0)) >= 0) |
930 | 0 | xmlbase = &namedata[index]; |
931 | 0 | } |
932 | 179 | if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0) |
933 | 179 | return 0; // no voice change |
934 | 0 | return CLAUSE_VOICE; |
935 | 3.96k | case SSML_VOICE: |
936 | 3.96k | if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0) |
937 | 3.71k | return 0; // no voice change |
938 | 248 | return CLAUSE_VOICE; |
939 | 188 | case SSML_SPEAK + SSML_CLOSE: |
940 | | // unwind stack until the previous <voice> or <speak> tag |
941 | 188 | while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_SPEAK)) |
942 | 0 | (*n_ssml_stack)--; |
943 | 188 | return CLAUSE_PERIOD + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
944 | 177 | case SSML_VOICE + SSML_CLOSE: |
945 | | // unwind stack until the previous <voice> or <speak> tag |
946 | 177 | while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_VOICE)) |
947 | 0 | (*n_ssml_stack)--; |
948 | | |
949 | 177 | terminator = 0; // ?? Sentence intonation, but no pause ?? |
950 | 177 | return terminator + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
951 | 353 | case HTML_BREAK: |
952 | 538 | case HTML_BREAK + SSML_CLOSE: |
953 | 538 | return CLAUSE_COLON; |
954 | 549 | case SSML_SENTENCE: |
955 | 549 | if (ssml_sp->tag_type == SSML_SENTENCE) { |
956 | | // new sentence implies end-of-sentence |
957 | 0 | voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
958 | 0 | } |
959 | 549 | voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
960 | 549 | return CLAUSE_PARAGRAPH + voice_change_flag; |
961 | 13.8k | case SSML_PARAGRAPH: |
962 | 13.8k | if (ssml_sp->tag_type == SSML_SENTENCE) { |
963 | | // new paragraph implies end-of-sentence or end-of-paragraph |
964 | 0 | voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
965 | 0 | } |
966 | 13.8k | if (ssml_sp->tag_type == SSML_PARAGRAPH) { |
967 | | // new paragraph implies end-of-sentence or end-of-paragraph |
968 | 0 | voice_change_flag |= GetVoiceAttributes(px, SSML_PARAGRAPH+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
969 | 0 | } |
970 | 13.8k | voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
971 | 13.8k | return CLAUSE_PARAGRAPH + voice_change_flag; |
972 | 274 | case SSML_SENTENCE + SSML_CLOSE: |
973 | 274 | if (ssml_sp->tag_type == SSML_SENTENCE) { |
974 | | // end of a sentence which specified a language |
975 | 0 | voice_change_flag = GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name); |
976 | 0 | } |
977 | 274 | return CLAUSE_PERIOD + voice_change_flag; |
978 | 546 | case SSML_PARAGRAPH + SSML_CLOSE: |
979 | 546 | if ((ssml_sp->tag_type == SSML_SENTENCE) || (ssml_sp->tag_type == SSML_PARAGRAPH)) { |
980 | | // End of a paragraph which specified a language. |
981 | | // (End-of-paragraph also implies end-of-sentence) |
982 | 0 | return GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) + CLAUSE_PARAGRAPH; |
983 | 0 | } |
984 | 546 | return CLAUSE_PARAGRAPH; |
985 | 207k | } |
986 | 185k | return 0; |
987 | 207k | } |
988 | | |
989 | | #pragma GCC visibility push(default) |
990 | | ESPEAK_API void espeak_SetUriCallback(int (*UriCallback)(int, const char *, const char *)) |
991 | 0 | { |
992 | 0 | uri_callback = UriCallback; |
993 | 0 | } |
994 | | #pragma GCC visibility pop |
995 | | |
996 | | static const MNEM_TAB xml_entity_mnemonics[] = { |
997 | | { "gt", '>' }, |
998 | | { "lt", 0xe000 + '<' }, // private usage area, to avoid confusion with XML tag |
999 | | { "amp", '&' }, |
1000 | | { "quot", '"' }, |
1001 | | { "nbsp", ' ' }, |
1002 | | { "apos", '\'' }, |
1003 | | { NULL, -1 } |
1004 | | }; |
1005 | | |
1006 | 5.36k | int ParseSsmlReference(char *ref, int *c1, int *c2) { |
1007 | | // Check if buffer *ref contains an XML character or entity reference |
1008 | | // if found, set *c1 to the replacement char |
1009 | | // change *c2 for entity references |
1010 | | // returns >= 0 on success |
1011 | | |
1012 | 5.36k | if (ref[0] == '#') { |
1013 | | // character reference |
1014 | 4.14k | if (ref[1] == 'x') |
1015 | 345 | return sscanf(&ref[2], "%x", (unsigned int *)c1); |
1016 | 3.79k | else |
1017 | 3.79k | return sscanf(&ref[1], "%d", c1); |
1018 | 4.14k | } else { |
1019 | | // entity reference |
1020 | 1.22k | int found; |
1021 | 1.22k | if ((found = LookupMnem(xml_entity_mnemonics, ref)) != -1) { |
1022 | 619 | *c1 = found; |
1023 | 619 | if (*c2 == 0) |
1024 | 6 | *c2 = ' '; |
1025 | 619 | return found; |
1026 | 619 | } |
1027 | 1.22k | } |
1028 | 602 | return -1; |
1029 | 5.36k | } |