Coverage Report

Created: 2026-06-13 07:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/espeak-ng/src/libespeak-ng/ssml.c
Line
Count
Source
1
/*
2
 * Copyright (C) 2005 to 2015 by Jonathan Duddington
3
 * email: jonsd@users.sourceforge.net
4
 * Copyright (C) 2015-2017 Reece H. Dunn
5
 * Copyright (C) 2018 Juho Hiltunen
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; either version 3 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * This program is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
19
 */
20
21
22
#include "config.h"
23
24
#include <ctype.h>
25
#include <errno.h>
26
#include <locale.h>
27
#include <math.h>
28
#include <stdint.h>
29
#include <stdio.h>
30
#include <stdlib.h>
31
#include <string.h>
32
#include <unistd.h>
33
#include <wchar.h>
34
#include <wctype.h>
35
36
37
#include <espeak-ng/espeak_ng.h>
38
#include <espeak-ng/speak_lib.h>
39
#include <espeak-ng/encoding.h>
40
#include <ucd/ucd.h>
41
42
#include "ssml.h"
43
#include "common.h"           // for strncpy0
44
#include "mnemonics.h"               // for LookupMnemName, MNEM_TAB, 
45
#include "readclause.h"           // for PARAM_STACK, param_stack, AddNameData
46
#include "soundicon.h"               // for LoadSoundFile2
47
#include "synthesize.h"           // for SPEED_FACTORS, speed
48
#include "translate.h"            // for CTRL_EMBEDDED
49
#include "voice.h"                // for SelectVoice, SelectVoiceByName
50
#include "speech.h"               // for MAKE_MEM_UNDEFINED
51
52
static const MNEM_TAB ssmltags[] = {
53
  { "speak",     SSML_SPEAK },
54
  { "voice",     SSML_VOICE },
55
  { "prosody",   SSML_PROSODY },
56
  { "say-as",    SSML_SAYAS },
57
  { "mark",      SSML_MARK },
58
  { "s",         SSML_SENTENCE },
59
  { "p",         SSML_PARAGRAPH },
60
  { "phoneme",   SSML_PHONEME },
61
  { "sub",       SSML_SUB },
62
  { "tts:style", SSML_STYLE },
63
  { "audio",     SSML_AUDIO },
64
  { "emphasis",  SSML_EMPHASIS },
65
  { "break",     SSML_BREAK },
66
  { "metadata",  SSML_IGNORE_TEXT },
67
68
  { "br",     HTML_BREAK },
69
  { "li",     HTML_BREAK },
70
  { "dd",     HTML_BREAK },
71
  { "img",    HTML_BREAK },
72
  { "td",     HTML_BREAK },
73
  { "h1",     SSML_PARAGRAPH },
74
  { "h2",     SSML_PARAGRAPH },
75
  { "h3",     SSML_PARAGRAPH },
76
  { "h4",     SSML_PARAGRAPH },
77
  { "hr",     SSML_PARAGRAPH },
78
  { "script", SSML_IGNORE_TEXT },
79
  { "style",  SSML_IGNORE_TEXT },
80
  { "font",   HTML_NOSPACE },
81
  { "b",      HTML_NOSPACE },
82
  { "i",      HTML_NOSPACE },
83
  { "strong", HTML_NOSPACE },
84
  { "em",     HTML_NOSPACE },
85
  { "code",   HTML_NOSPACE },
86
87
  { NULL, 0 }
88
};
89
90
// Optional handler consulted for the "src" URI of an <audio> tag.  While it is
// NULL (its initial value) the sound file is loaded directly instead.
static int (*uri_callback)(int, const char *, const char *) = NULL;
91
92
// Compare a (still quoted) attribute value with an ASCII keyword.
//
// Returns 0 only when 'string1' starts with every character of 'string2' and
// the very next character of 'string1' is a closing quote (" or ').
// Returns 1 in every other case, including when 'string1' is NULL.
static int attrcmp(const wchar_t *string1, const char *string2)
{
  const wchar_t *value;
  const char *keyword;

  if (string1 == NULL)
    return 1;

  // advance over the common prefix
  value = string1;
  keyword = string2;
  while ((*value == *keyword) && (*value != 0)) {
    value++;
    keyword++;
  }

  // a match requires the keyword to be used up exactly where the quote closes
  if ((*keyword == 0) && ((*value == '"') || (*value == '\'')))
    return 0;

  return 1;
}
105
106
107
static int attrlookup(const wchar_t *string1, const MNEM_TAB *mtab)
108
16.0k
{
109
16.0k
  int ix;
110
111
63.9k
  for (ix = 0; mtab[ix].mnem != NULL; ix++) {
112
47.8k
    if (attrcmp(string1, mtab[ix].mnem) == 0)
113
0
      return mtab[ix].value;
114
47.8k
  }
115
16.0k
  return mtab[ix].value;
116
16.0k
}
117
118
// Parse an unsigned decimal number from the start of an attribute value.
//
// pw:            attribute value, may be NULL
// default_value: returned when 'pw' is NULL or does not start with a digit 0-9
// type:          1 = time value; if the digits are followed by 's' or 'S' the
//                number is multiplied by 1000 (seconds to ms)
//
// The attribute text comes from the input document, so the digit string can be
// arbitrarily long.  The result saturates at INT32_MAX instead of overflowing
// a signed int (which would be undefined behaviour).
static int attrnumber(const wchar_t *pw, int default_value, int type)
{
  int value = 0;

  if ((pw == NULL) || !IsDigit09(*pw))
    return default_value;

  while (IsDigit09(*pw)) {
    int digit = *pw++ - '0';

    if (value > (INT32_MAX - digit) / 10)
      value = INT32_MAX; // would overflow: clamp, but keep consuming digits
    else
      value = value*10 + digit;
  }

  if ((type == 1) && (ucd_tolower(*pw) == 's')) {
    // time: seconds rather than ms
    if (value > INT32_MAX / 1000)
      value = INT32_MAX;
    else
      value *= 1000;
  }
  return value;
}
133
134
// Convert an attribute value to UTF-8.
//
// buf: destination, receives a NUL-terminated UTF-8 string
// pw:  start of the attribute value, or NULL for "attribute absent" (gives an
//      empty string).  pw[-1] is read to find out whether the value was opened
//      with a quote, so 'pw' must have a readable element in front of it.
// len: size of 'buf' in bytes
//
// A quoted value ends at the matching quote unless that quote is preceded by a
// backslash; an unquoted value ends at whitespace or '/'.  Copying also stops
// once fewer than 4 spare bytes (plus terminator) remain in 'buf'.
//
// Returns the number of bytes written, not counting the terminator.
static int attrcopy_utf8(char *buf, const wchar_t *pw, int len)
{
  int ix = 0;

  if (len <= 0)
    return 0; // no room for even the terminator, leave 'buf' untouched

  if (pw != NULL) {
    wchar_t quote = pw[-1];
    if ((quote != '"') && (quote != '\'')) quote = 0;

    unsigned int c;
    unsigned int prev_c = 0;
    while ((ix < (len-4)) && ((c = *pw++) != 0)) {
      // 'c' is a wide character: isspace() is only defined for unsigned char
      // values and EOF, so the wide classification function must be used
      if ((quote == 0) && (iswspace(c) || (c == '/')))
        break;
      if ((quote != 0) && (c == (unsigned int)quote) && (prev_c != '\\'))
        break; // quote indicates end of attribute, unless preceded by backslash

      ix += utf8_out(c, &buf[ix]);
      prev_c = c;
    }
  }
  buf[ix] = 0;
  return ix;
}
159
160
static int attr_prosody_value(int param_type, const wchar_t *pw, int *value_out)
161
0
{
162
0
  int sign = 0;
163
0
  wchar_t *tail;
164
0
  double value;
165
166
0
  while (iswspace(*pw)) pw++;
167
0
  if (*pw == '+') {
168
0
    pw++;
169
0
    sign = 1;
170
0
  }
171
0
  if (*pw == '-') {
172
0
    pw++;
173
0
    sign = -1;
174
0
  }
175
0
  value = (double)wcstod(pw, &tail);
176
0
  if (tail == pw) {
177
    // failed to find a number, return 100%
178
0
    *value_out = 100;
179
0
    return 2;
180
0
  }
181
182
0
  if (*tail == '%') {
183
0
    if (sign != 0)
184
0
      value = 100 + (sign * value);
185
0
    *value_out = (int)value;
186
0
    return 2; // percentage
187
0
  }
188
189
0
  if ((tail[0] == 's') && (tail[1] == 't')) {
190
0
    double x;
191
    // convert from semitones to a  frequency percentage
192
0
    x = pow((double)2.0, (double)((value*sign)/12)) * 100;
193
0
    *value_out = (int)x;
194
0
    return 2; // percentage
195
0
  }
196
197
0
  if (param_type == espeakRATE) {
198
0
    if (sign == 0)
199
0
      *value_out = (int)(value * 100);
200
0
    else
201
0
      *value_out = 100 + (int)(sign * value * 100);
202
0
    return 2; // percentage
203
0
  }
204
205
0
  *value_out = (int)value;
206
0
  return sign;   // -1, 0, or 1
207
0
}
208
209
// Use the voice properties from the SSML stack to choose a voice.
//
// ssml_stack / n_ssml_stack: the stack frames; later frames override earlier ones
// base_voice:                voice that was active before any SSML was applied
// base_voice_variant_name:   variant of the base voice, "" if none
//
// Returns the identifier of the selected voice, or "default" when SelectVoice()
// finds nothing.  The result may point at a static buffer inside this function,
// so it is only valid until the next call.
//
// All copies into the fixed-size local buffers are length-limited: the strings
// come from voice files and from the input document.
static const char *VoiceFromStack(SSML_STACK *ssml_stack, int n_ssml_stack, espeak_VOICE *base_voice, char base_voice_variant_name[40])
{
  int ix;
  const char *p;
  SSML_STACK *sp;
  const char *v_id;
  int voice_found;
  espeak_VOICE voice_select;
  static char voice_name[40];
  static char identifier[40];
  char language[40];

  MAKE_MEM_UNDEFINED(&voice_name, sizeof(voice_name));

  strncpy0(voice_name, ssml_stack[0].voice_name, sizeof(voice_name));
  strncpy0(language, ssml_stack[0].language, sizeof(language));
  voice_select.age = ssml_stack[0].voice_age;
  voice_select.gender = ssml_stack[0].voice_gender;
  voice_select.variant = ssml_stack[0].voice_variant_number;
  voice_select.identifier = NULL;

  for (ix = 0; ix < n_ssml_stack; ix++) {
    espeak_VOICE *v;
    sp = &ssml_stack[ix];
    int voice_name_specified = 0;

    if ((sp->voice_name[0] != 0) && ((v = SelectVoiceByName(NULL, sp->voice_name)) != NULL)) {
      // an explicit, known voice name resets the other selection criteria
      voice_name_specified = 1;
      strncpy0(voice_name, sp->voice_name, sizeof(voice_name));
      strncpy0(identifier, v->identifier, sizeof(identifier));
      language[0] = 0;
      voice_select.gender = ENGENDER_UNKNOWN;
      voice_select.age = 0;
      voice_select.variant = 0;
    }
    if (sp->language[0] != 0) {
      strncpy0(language, sp->language, sizeof(language));

      // is this language provided by the base voice?
      // 'languages' is walked as a list of entries, each one leading byte
      // followed by a NUL-terminated name; a zero leading byte ends the list
      p = base_voice->languages;
      while (*p++ != 0) {
        if (strcmp(p, language) == 0) {
          // yes, change the language to the main language of the base voice
          strncpy0(language, &base_voice->languages[1], sizeof(language));
          break;
        }
        p += (strlen(p) + 1);
      }

      if (voice_name_specified == 0)
      {
        voice_name[0] = 0; // forget a previous voice name if a language is specified
        identifier[0] = 0;
      }
    }
    if (sp->voice_gender != ENGENDER_UNKNOWN)
      voice_select.gender = sp->voice_gender;

    if (sp->voice_age != 0)
      voice_select.age = sp->voice_age;
    if (sp->voice_variant_number != 0)
      voice_select.variant = sp->voice_variant_number;
  }

  voice_select.name = voice_name;
  voice_select.identifier = identifier;
  voice_select.languages = language;

  v_id = SelectVoice(&voice_select, &voice_found);
  if (v_id == NULL)
    return "default";

  if ((strchr(v_id, '+') == NULL) && ((voice_select.gender == ENGENDER_UNKNOWN) || (voice_select.gender == base_voice->gender)) && (base_voice_variant_name[0] != 0)) {
    // a voice variant has not been selected, use the original voice variant
    char buf[80];
    // built in a scratch buffer first, then copied with truncation
    snprintf(buf, sizeof(buf), "%s+%s", v_id, base_voice_variant_name);
    strncpy0(voice_name, buf, sizeof(voice_name));
    return voice_name;
  }
  return v_id;
}
293
294
295
// Gets the value string for an attribute.
//
// pw:   the attribute part of a tag, NUL-terminated.  pw[-1] is read, so 'pw'
//       must not be the first element of its buffer.
// name: attribute name to look for (ASCII)
//
// A name is only recognised directly after a whitespace character.  Optional
// whitespace and an optional '=' after the name are skipped.
//
// Returns:
//   - NULL if the attribute is not present
//   - a pointer just past the opening quote for a quoted value
//   - a pointer to an empty string if the value position holds '/'
//   - otherwise a pointer to the first character of the unquoted value
// Every non-NULL result has a readable element in front of it.
static const wchar_t *GetSsmlAttribute(wchar_t *pw, const char *name)
{
  int ix;
  // element [1] is handed out, so that result[-1] is still inside the array
  static const wchar_t empty[2] = { 0, 0 };

  while (*pw != 0) {
    if (iswspace(pw[-1])) {
      ix = 0;
      // stop at the end of 'name': otherwise the two terminators compare equal
      // when the name is the last thing in the buffer, and both pointers run
      // off the end of their strings
      while ((name[ix] != 0) && (*pw == name[ix])) {
        pw++;
        ix++;
      }
      if (name[ix] == 0) {
        // found the attribute, now get the value
        while (iswspace(*pw)) pw++;
        if (*pw == '=') pw++;
        while (iswspace(*pw)) pw++;
        if ((*pw == '"') || (*pw == '\'')) // allow single-quotes ?
          return pw+1;
        else if (iswspace(*pw) || (*pw == '/')) // end of attribute
          return &empty[1];
        else
          return pw;
      }
      if (*pw == 0)
        break; // partial match ran into the terminator, don't step past it
    }
    pw++;
  }
  return NULL;
}
327
328
329
// Determines whether voice attributes are specified in this tag, and if so,
// whether this means a voice change.
// If it's a closing tag, delete the top frame of the stack and determine
// whether this implies a voice change.
//
// pw:               attribute part of the tag
// tag_type:         tag type, with SSML_CLOSE set for a closing tag
// ssml_sp:          overwritten before use; the incoming value is ignored
// ssml_stack:       the SSML stack frames
// n_ssml_stack:     number of frames in use
// current_voice_id: identifier of the voice now in use; updated on a change
//
// Returns CLAUSE_TYPE_VOICE_CHANGE if there is a voice change, otherwise 0.
//
// NOTE(review): n_ssml_stack is passed by value, so the push/pop performed
// here is not visible to the caller through this parameter, and the push is
// not range-checked against the stack capacity - confirm against the caller.
static int GetVoiceAttributes(wchar_t *pw, int tag_type, SSML_STACK *ssml_sp, SSML_STACK *ssml_stack, int n_ssml_stack, char current_voice_id[40], espeak_VOICE *base_voice, char *base_voice_variant_name)
{
  const char *new_voice_id;

  static const MNEM_TAB mnem_gender[] = {
    { "male", ENGENDER_MALE },
    { "female", ENGENDER_FEMALE },
    { "neutral", ENGENDER_NEUTRAL },
    { NULL, ENGENDER_UNKNOWN }
  };

  if (tag_type & SSML_CLOSE) {
    // delete a stack frame
    if (n_ssml_stack > 1)
      n_ssml_stack--;
  } else {
    const wchar_t *lang;
    const wchar_t *gender;
    const wchar_t *name;
    const wchar_t *age;
    const wchar_t *variant;

    // add a stack frame if any voice details are specified
    lang = GetSsmlAttribute(pw, "xml:lang");

    if (tag_type != SSML_VOICE) {
      // only expect an xml:lang attribute
      name = NULL;
      variant = NULL;
      age = NULL;
      gender = NULL;
    } else {
      name = GetSsmlAttribute(pw, "name");
      variant = GetSsmlAttribute(pw, "variant");
      age = GetSsmlAttribute(pw, "age");
      gender = GetSsmlAttribute(pw, "gender");
    }

    if ((tag_type != SSML_VOICE) && (lang == NULL))
      return 0; // <s> or <p> without language spec, nothing to do

    ssml_sp = &ssml_stack[n_ssml_stack++];

    int value;

    attrcopy_utf8(ssml_sp->language, lang, sizeof(ssml_sp->language));
    attrcopy_utf8(ssml_sp->voice_name, name, sizeof(ssml_sp->voice_name));
    if ((value = attrnumber(variant, 1, 0)) > 0)
      value--; // variant='0' and variant='1' the same
    ssml_sp->voice_variant_number = value;
    ssml_sp->voice_age = attrnumber(age, 0, 0);
    ssml_sp->voice_gender = attrlookup(gender, mnem_gender);
    ssml_sp->tag_type = tag_type;
  }

  new_voice_id = VoiceFromStack(ssml_stack, n_ssml_stack, base_voice, base_voice_variant_name);
  if (strcmp(new_voice_id, current_voice_id) != 0) {
    // add an embedded command to change the voice
    // bounded copy: 40 is the buffer size declared for current_voice_id in
    // this function's signature, and a voice identifier may be longer
    strncpy0(current_voice_id, new_voice_id, 40);
    return CLAUSE_TYPE_VOICE_CHANGE;
  }

  return 0;
}
399
400
// Set the speech parameters from the parameter stack.
//
// For each parameter the effective value is the one from the highest stack
// frame that sets it (value >= 0), or -1 if no frame does.  Wherever that
// differs from speech_parameters[], the entry is updated and, depending on the
// parameter, either an option variable is set or an embedded command is
// appended to 'outbuf' at *outix (which is advanced past it).
static void ProcessParamStack(char *outbuf, int *outix, int n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
{
  static const char cmd_letter[N_SPEECH_PARAM] = { 0, 'S', 'A', 'P', 'R', 0, 'C', 0, 0, 0, 0, 0, 'F' }; // embedded command letters
  int effective[N_SPEECH_PARAM];
  char cmd[20];
  int param;
  int frame;

  // collapse the stack: a later frame wins over an earlier one
  for (param = 0; param < N_SPEECH_PARAM; param++) {
    effective[param] = -1;
    for (frame = 0; frame < n_param_stack; frame++) {
      int setting = param_stack[frame].parameter[param];
      if (setting >= 0)
        effective[param] = setting;
    }
  }

  for (param = 0; param < N_SPEECH_PARAM; param++) {
    int value = effective[param];

    if (value == speech_parameters[param])
      continue; // unchanged

    cmd[0] = 0;

    switch (param)
    {
    case espeakPUNCTUATION:
      option_punctuation = value-1;
      break;
    case espeakCAPITALS:
      option_capitals = value;
      break;
    case espeakRATE:
    case espeakVOLUME:
    case espeakPITCH:
    case espeakRANGE:
    case espeakEMPHASIS:
      sprintf(cmd, "%c%d%c", CTRL_EMBEDDED, value, cmd_letter[param]);
      break;
    default:
      // no command for this parameter; 'cmd' stays empty
      break;
    }

    speech_parameters[param] = value;
    // an empty 'cmd' still writes a terminator at the current output position
    strcpy(&outbuf[*outix], cmd);
    *outix += strlen(cmd);
  }
}
447
448
// Start a new parameter stack frame for 'tag_type'.
//
// The frame at index *n_param_stack is reset (every parameter set to -1) and
// returned.  The count is only advanced while it is below N_PARAM_STACK-1, so
// once that limit is reached the same top frame is handed out again.
static PARAM_STACK *PushParamStack(int tag_type, int *n_param_stack, PARAM_STACK *param_stack)
{
  PARAM_STACK *frame = &param_stack[*n_param_stack];
  int param;

  if (*n_param_stack < (N_PARAM_STACK-1))
    *n_param_stack += 1;

  for (param = 0; param < N_SPEECH_PARAM; param++)
    frame->parameter[param] = -1;
  frame->type = tag_type;

  return frame;
}
462
463
// Unwind the stack up to and including the most recent frame of this tag type,
// then re-apply the remaining parameters through ProcessParamStack().
//
// 'tag_type' may be given with or without SSML_CLOSE added.  Frame 0 is never
// removed: if the only match is frame 0, or there is no match, the stack depth
// is left unchanged.
static void PopParamStack(int tag_type, char *outbuf, int *outix, int *n_param_stack, PARAM_STACK *param_stack, int *speech_parameters)
{
  int wanted = (tag_type >= SSML_CLOSE) ? (tag_type - SSML_CLOSE) : tag_type;
  int top = 0;
  int ix;

  // search downwards; the first hit is the most recently pushed matching frame
  for (ix = *n_param_stack - 1; ix > 0; ix--) {
    if (param_stack[ix].type == wanted) {
      top = ix;
      break;
    }
  }

  if (top > 0)
    *n_param_stack = top;

  ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
}
480
481
static int ReplaceKeyName(char *outbuf, int index, int *outix)
482
0
{
483
  // Replace some key-names by single characters, so they can be pronounced in different languages
484
0
  static const MNEM_TAB keynames[] = {
485
0
    { "space ",        0xe020 },
486
0
    { "tab ",          0xe009 },
487
0
    { "underscore ",   0xe05f },
488
0
    { "double-quote ", '"' },
489
0
    { NULL,            0 }
490
0
  };
491
492
0
  int letter;
493
0
  char *p;
494
495
0
  p = &outbuf[index];
496
497
0
  if ((letter = LookupMnem(keynames, p)) != 0) {
498
0
    int ix;
499
0
     ix = utf8_out(letter, p);
500
0
    *outix = index + ix;
501
0
    return letter;
502
0
  }
503
0
  return 0;
504
0
}
505
506
// Store the value of one prosody attribute in the stack frame 'sp'.
//
// param_type:        index into the mnemonic tables below (1 to 4: rate,
//                    volume, pitch, range)
// attr1:             the attribute value text
// sp:                frame that receives the result
// param_stack:       frame 0 supplies the base value for named levels
// speech_parameters: current values, the base for relative numeric changes
//
// A named level ("slow", "x-loud", ...) is a percentage of the base value in
// param_stack[0].  Anything else is parsed by attr_prosody_value().
static void SetProsodyParameter(int param_type, const wchar_t *attr1, PARAM_STACK *sp, PARAM_STACK *param_stack, int *speech_parameters)
{
  static const MNEM_TAB mnem_volume[] = {
    { "default", 100 },
    { "silent",    0 },
    { "x-soft",   30 },
    { "soft",     65 },
    { "medium",  100 },
    { "loud",    150 },
    { "x-loud",  230 },
    { NULL,       -1 }
  };

  static const MNEM_TAB mnem_rate[] = {
    { "default", 100 },
    { "x-slow",   60 },
    { "slow",     80 },
    { "medium",  100 },
    { "fast",    125 },
    { "x-fast",  160 },
    { NULL,       -1 }
  };

  static const MNEM_TAB mnem_pitch[] = {
    { "default", 100 },
    { "x-low",    70 },
    { "low",      85 },
    { "medium",  100 },
    { "high",    110 },
    { "x-high",  120 },
    { NULL,       -1 }
  };

  static const MNEM_TAB mnem_range[] = {
    { "default", 100 },
    { "x-low",    20 },
    { "low",      50 },
    { "medium",  100 },
    { "high",    140 },
    { "x-high",  180 },
    { NULL,       -1 }
  };

  static const MNEM_TAB * const mnem_tabs[5] = {
    NULL, mnem_rate, mnem_volume, mnem_pitch, mnem_range
  };

  int value = attrlookup(attr1, mnem_tabs[param_type]);

  if (value >= 0) {
    // mnemonic specifies a value as a percentage of the base pitch/range/rate/volume
    sp->parameter[param_type] = (param_stack[0].parameter[param_type] * value)/100;
    return;
  }

  int sign = attr_prosody_value(param_type, attr1, &value);

  switch (sign)
  {
  case 0:
    // absolute value in Hz
    sp->parameter[param_type] = value;
    break;
  case 2:
    // change specified as percentage or in semitones
    sp->parameter[param_type] = (speech_parameters[param_type] * value)/100;
    break;
  default:
    // change specified as plus or minus Hz
    sp->parameter[param_type] = speech_parameters[param_type] + (value*sign);
    break;
  }
}
573
574
int ProcessSsmlTag(wchar_t *xml_buf, char *outbuf, int *outix, int n_outbuf, const char *xmlbase, bool *audio_text, char *current_voice_id, espeak_VOICE *base_voice, char *base_voice_variant_name, bool *ignore_text, bool *clear_skipping_text, int *sayas_mode, int *sayas_start, SSML_STACK *ssml_stack, int *n_ssml_stack, int *n_param_stack, int *speech_parameters)
575
207k
{
576
  // xml_buf is the tag and attributes with a zero terminator in place of the original '>'
577
  // returns a clause terminator value.
578
579
207k
  unsigned int ix;
580
207k
  int index;
581
207k
  int tag_type;
582
207k
  int value;
583
207k
  int value2;
584
207k
  int value3;
585
207k
  int voice_change_flag;
586
207k
  wchar_t *px;
587
207k
  const wchar_t *attr1;
588
207k
  const wchar_t *attr2;
589
207k
  const wchar_t *attr3;
590
207k
  int terminator;
591
207k
  int param_type;
592
207k
  char tag_name[40];
593
207k
  char buf[160];
594
207k
  PARAM_STACK *sp;
595
207k
  SSML_STACK *ssml_sp;
596
597
  // don't process comments and xml declarations
598
207k
  if (wcsncmp(xml_buf, (wchar_t *) "!--", 3) == 0 || wcsncmp(xml_buf, (wchar_t *) "?xml", 4) == 0) {
599
0
    return 0;
600
0
    }
601
602
  // these tags have no effect if they are self-closing, eg. <voice />
603
207k
  static const char ignore_if_self_closing[] = { 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0 };
604
605
207k
  bool self_closing = false;
606
207k
  int len;
607
207k
  len = wcslen(xml_buf);
608
207k
  if (xml_buf[len - 1] == '/') {
609
    // a self-closing tag
610
1.43k
    xml_buf[len - 1] = ' ';
611
1.43k
    self_closing = true;
612
1.43k
  }
613
614
207k
  static const MNEM_TAB mnem_phoneme_alphabet[] = {
615
207k
    { "espeak", 1 },
616
207k
    { NULL,    -1 }
617
207k
  };
618
619
207k
  static const MNEM_TAB mnem_punct[] = {
620
207k
    { "none", 1 },
621
207k
    { "all",  2 },
622
207k
    { "some", 3 },
623
207k
    { NULL,  -1 }
624
207k
  };
625
626
207k
  static const MNEM_TAB mnem_capitals[] = {
627
207k
    { "no",        0 },
628
207k
    { "icon",      1 },
629
207k
    { "spelling",  2 },
630
207k
    { "pitch",    20 },  // this is the amount by which to raise the pitch
631
207k
    { NULL,       -1 }
632
207k
  };
633
634
207k
  static const MNEM_TAB mnem_interpret_as[] = {
635
207k
    { "characters", SAYAS_CHARS },
636
207k
    { "tts:char",   SAYAS_SINGLE_CHARS },
637
207k
    { "tts:key",    SAYAS_KEY },
638
207k
    { "tts:digits", SAYAS_DIGITS },
639
207k
    { "telephone",  SAYAS_DIGITS1 },
640
207k
    { NULL,         -1 }
641
207k
  };
642
643
207k
  static const MNEM_TAB mnem_sayas_format[] = {
644
207k
    { "glyphs", 1 },
645
207k
    { NULL,    -1 }
646
207k
  };
647
648
207k
  static const MNEM_TAB mnem_break[] = {
649
207k
    { "none",     0 },
650
207k
    { "x-weak",   1 },
651
207k
    { "weak",     2 },
652
207k
    { "medium",   3 },
653
207k
    { "strong",   4 },
654
207k
    { "x-strong", 5 },
655
207k
    { NULL,      -1 }
656
207k
  };
657
658
207k
  static const MNEM_TAB mnem_emphasis[] = {
659
207k
    { "none",     1 },
660
207k
    { "reduced",  2 },
661
207k
    { "moderate", 3 },
662
207k
    { "strong",   4 },
663
207k
    { "x-strong", 5 },
664
207k
    { NULL,      -1 }
665
207k
  };
666
667
207k
  static const char * const prosody_attr[5] = {
668
207k
    NULL, "rate", "volume", "pitch", "range"
669
207k
  };
670
671
589k
  for (ix = 0; ix < (sizeof(tag_name)-1); ix++) {
672
589k
    int c;
673
589k
    if (((c = xml_buf[ix]) == 0) || iswspace(c))
674
207k
      break;
675
381k
    tag_name[ix] = tolower((char)c);
676
381k
  }
677
207k
  tag_name[ix] = 0;
678
679
207k
  px = &xml_buf[ix]; // the tag's attributes
680
681
207k
  if (tag_name[0] == '/') {
682
    // closing tag
683
5.02k
    if ((tag_type = LookupMnem(ssmltags, &tag_name[1])) != HTML_NOSPACE)
684
4.83k
      outbuf[(*outix)++] = ' ';
685
5.02k
    tag_type += SSML_CLOSE;
686
202k
  } else {
687
202k
    if ((tag_type = LookupMnem(ssmltags, tag_name)) != HTML_NOSPACE) {
688
      // separate SSML tags from the previous word (but not HMTL tags such as <b> <font> which can occur inside a word)
689
202k
      outbuf[(*outix)++] = ' ';
690
202k
    }
691
692
202k
    if (self_closing && ignore_if_self_closing[tag_type])
693
50
      return 0;
694
202k
  }
695
696
207k
  voice_change_flag = 0;
697
207k
  ssml_sp = &ssml_stack[*n_ssml_stack-1];
698
699
207k
  switch (tag_type)
700
207k
  {
701
92
  case SSML_STYLE:
702
92
    sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
703
92
    attr1 = GetSsmlAttribute(px, "field");
704
92
    attr2 = GetSsmlAttribute(px, "mode");
705
706
707
92
    if (attrcmp(attr1, "punctuation") == 0) {
708
0
      value = attrlookup(attr2, mnem_punct);
709
0
      sp->parameter[espeakPUNCTUATION] = value;
710
92
    } else if (attrcmp(attr1, "capital_letters") == 0) {
711
0
      value = attrlookup(attr2, mnem_capitals);
712
0
      sp->parameter[espeakCAPITALS] = value;
713
0
    }
714
92
    ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
715
92
    break;
716
1.13k
  case SSML_PROSODY:
717
1.13k
    sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
718
719
    // look for attributes:  rate, volume, pitch, range
720
5.65k
    for (param_type = espeakRATE; param_type <= espeakRANGE; param_type++) {
721
4.52k
      if ((attr1 = GetSsmlAttribute(px, prosody_attr[param_type])) != NULL)
722
0
        SetProsodyParameter(param_type, attr1, sp, param_stack, speech_parameters);
723
4.52k
    }
724
725
1.13k
    ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
726
1.13k
    break;
727
2.88k
  case SSML_EMPHASIS:
728
2.88k
    sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *) param_stack);
729
2.88k
    value = 3; // default is "moderate"
730
2.88k
    if ((attr1 = GetSsmlAttribute(px, "level")) != NULL)
731
0
      value = attrlookup(attr1, mnem_emphasis);
732
733
2.88k
    if (translator->langopts.tone_language == 1) {
734
2
      static const unsigned char emphasis_to_pitch_range[] = { 50, 50, 40, 70, 90, 100 };
735
2
      static const unsigned char emphasis_to_volume[] = { 100, 100, 70, 110, 135, 150 };
736
      // tone language (eg.Chinese) do emphasis by increasing the pitch range.
737
2
      sp->parameter[espeakRANGE] = emphasis_to_pitch_range[value];
738
2
      sp->parameter[espeakVOLUME] = emphasis_to_volume[value];
739
2.88k
    } else {
740
2.88k
      static const unsigned char emphasis_to_volume2[] = { 100, 100, 75, 100, 120, 150 };
741
2.88k
      sp->parameter[espeakVOLUME] = emphasis_to_volume2[value];
742
2.88k
      sp->parameter[espeakEMPHASIS] = value;
743
2.88k
    }
744
2.88k
    ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
745
2.88k
    break;
746
57
  case SSML_STYLE + SSML_CLOSE:
747
461
  case SSML_PROSODY + SSML_CLOSE:
748
1.23k
  case SSML_EMPHASIS + SSML_CLOSE:
749
1.23k
    PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
750
1.23k
    break;
751
63
  case SSML_PHONEME:
752
63
    attr1 = GetSsmlAttribute(px, "alphabet");
753
63
    attr2 = GetSsmlAttribute(px, "ph");
754
63
    value = attrlookup(attr1, mnem_phoneme_alphabet);
755
63
    if (value == 1) { // alphabet="espeak"
756
0
      outbuf[(*outix)++] = '[';
757
0
      outbuf[(*outix)++] = '[';
758
0
      *outix += attrcopy_utf8(&outbuf[*outix], attr2, n_outbuf-*outix);
759
0
      outbuf[(*outix)++] = ']';
760
0
      outbuf[(*outix)++] = ']';
761
0
    }
762
63
    break;
763
276
  case SSML_SAYAS:
764
276
    attr1 = GetSsmlAttribute(px, "interpret-as");
765
276
    attr2 = GetSsmlAttribute(px, "format");
766
276
    attr3 = GetSsmlAttribute(px, "detail");
767
276
    value = attrlookup(attr1, mnem_interpret_as);
768
276
    value2 = attrlookup(attr2, mnem_sayas_format);
769
276
    if (value2 == 1)
770
0
      value = SAYAS_GLYPHS;
771
772
276
    value3 = attrnumber(attr3, 0, 0);
773
774
276
    if (value == SAYAS_DIGITS) {
775
0
      if (value3 <= 1)
776
0
        value = SAYAS_DIGITS1;
777
0
      else
778
0
        value = SAYAS_DIGITS + value3;
779
0
    }
780
781
276
    sprintf(buf, "%c%dY", CTRL_EMBEDDED, value);
782
276
    strcpy(&outbuf[*outix], buf);
783
276
    *outix += strlen(buf);
784
785
276
    *sayas_start = *outix;
786
276
    *sayas_mode = value; // punctuation doesn't end clause during SAY-AS
787
276
    break;
788
15
  case SSML_SAYAS + SSML_CLOSE:
789
15
    if (*sayas_mode == SAYAS_KEY) {
790
0
      outbuf[*outix] = 0;
791
0
      ReplaceKeyName(outbuf, *sayas_start, outix);
792
0
    }
793
794
15
    outbuf[(*outix)++] = CTRL_EMBEDDED;
795
15
    outbuf[(*outix)++] = 'Y';
796
15
    *sayas_mode = 0;
797
15
    break;
798
324
  case SSML_SUB:
799
324
    if ((attr1 = GetSsmlAttribute(px, "alias")) != NULL) {
800
      // use the alias  rather than the text
801
0
      *ignore_text = true;
802
0
      *outix += attrcopy_utf8(&outbuf[*outix], attr1, n_outbuf-*outix);
803
0
    }
804
324
    break;
805
8.00k
  case SSML_IGNORE_TEXT:
806
8.00k
    *ignore_text = true;
807
8.00k
    break;
808
312
  case SSML_SUB + SSML_CLOSE:
809
879
  case SSML_IGNORE_TEXT + SSML_CLOSE:
810
879
    *ignore_text = false;
811
879
    break;
812
488
  case SSML_MARK:
813
488
    if ((attr1 = GetSsmlAttribute(px, "name")) != NULL) {
814
      // add name to circular buffer of marker names
815
0
      attrcopy_utf8(buf, attr1, sizeof(buf));
816
817
0
      if ((buf[0] != 0) && (strcmp(skip_marker, buf) == 0)) {
818
        // This is the marker we are waiting for before starting to speak
819
0
        *clear_skipping_text = true;
820
0
        skip_marker[0] = 0;
821
0
        return CLAUSE_NONE;
822
0
      }
823
824
0
      if ((index = AddNameData(buf, 0)) >= 0) {
825
0
        sprintf(buf, "%c%dM", CTRL_EMBEDDED, index);
826
0
        strcpy(&outbuf[*outix], buf);
827
0
        *outix += strlen(buf);
828
0
      }
829
0
    }
830
488
    break;
831
1.38k
  case SSML_AUDIO:
832
1.38k
    sp = PushParamStack(tag_type, n_param_stack, (PARAM_STACK *)param_stack);
833
834
1.38k
    if ((attr1 = GetSsmlAttribute(px, "src")) != NULL) {
835
0
      attrcopy_utf8(buf, attr1, sizeof(buf));
836
837
0
      if (uri_callback == NULL) {
838
0
        if ((xmlbase != NULL) && (buf[0] != '/')) {
839
0
          char fname[256];
840
0
          sprintf(fname, "%s/%s", xmlbase, buf);
841
0
          index = LoadSoundFile2(fname);
842
0
        } else
843
0
          index = LoadSoundFile2(buf);
844
0
        if (index >= 0) {
845
0
          sprintf(buf, "%c%dI", CTRL_EMBEDDED, index);
846
0
          strcpy(&outbuf[*outix], buf);
847
0
          *outix += strlen(buf);
848
0
          sp->parameter[espeakSILENCE] = 1;
849
0
        }
850
0
      } else {
851
0
        if ((index = AddNameData(buf, 0)) >= 0) {
852
0
          char *uri;
853
0
          uri = &namedata[index];
854
0
          if (uri_callback(1, uri, xmlbase) == 0) {
855
0
            sprintf(buf, "%c%dU", CTRL_EMBEDDED, index);
856
0
            strcpy(&outbuf[*outix], buf);
857
0
            *outix += strlen(buf);
858
0
            sp->parameter[espeakSILENCE] = 1;
859
0
          }
860
0
        }
861
0
      }
862
0
    }
863
1.38k
    ProcessParamStack(outbuf, outix, *n_param_stack, param_stack, speech_parameters);
864
865
1.38k
    if (self_closing)
866
701
      PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
867
683
    else
868
683
      *audio_text = true;
869
1.38k
    return CLAUSE_NONE;
870
533
  case SSML_AUDIO + SSML_CLOSE:
871
533
    PopParamStack(tag_type, outbuf, outix, n_param_stack, (PARAM_STACK *) param_stack, (int *) speech_parameters);
872
533
    *audio_text = false;
873
533
    return CLAUSE_NONE;
874
50
  case SSML_BREAK:
875
50
    value = 21;
876
50
    terminator = CLAUSE_NONE;
877
878
50
    if ((attr1 = GetSsmlAttribute(px, "strength")) != NULL) {
879
0
      static const int break_value[6] = { 0, 7, 14, 21, 40, 80 }; // *10mS
880
0
      value = attrlookup(attr1, mnem_break);
881
0
      if (value < 0) value = 2;
882
0
      if (value < 3) {
883
        // adjust prepause on the following word
884
0
        sprintf(&outbuf[*outix], "%c%dB", CTRL_EMBEDDED, value);
885
0
        *outix += 3;
886
0
        terminator = 0;
887
0
      }
888
0
      value = break_value[value];
889
0
    }
890
50
    if ((attr2 = GetSsmlAttribute(px, "time")) != NULL) {
891
0
      value2 = attrnumber(attr2, 0, 1);   // pause in mS
892
893
0
      value2 = value2 * speech_parameters[espeakSSML_BREAK_MUL] / 100;
894
895
0
      int wpm = speech_parameters[espeakRATE];
896
0
      espeak_SetParameter(espeakRATE, wpm, 0);
897
898
0
      #if USE_LIBSONIC
899
0
      if (wpm >= espeakRATE_MAXIMUM) {
900
        // Compensate speedup with libsonic, see function SetSpeed()
901
0
        double sonic = ((double)wpm)/espeakRATE_NORMAL;
902
0
        value2 = value2 * sonic;
903
0
      }
904
0
      #endif
905
906
      // compensate for speaking speed to keep constant pause length, see function PauseLength()
907
      // 'value' here is x 10mS
908
0
      value = (value2 * 256) / (speed.clause_pause_factor * 10);
909
0
      if (value < 200)
910
0
        value = (value2 * 256) / (speed.pause_factor * 10);
911
912
0
      if (terminator == 0)
913
0
        terminator = CLAUSE_NONE;
914
0
    }
915
50
    if (terminator) {
916
50
      if (value > 0xfff) {
917
        // scale down the value and set a scaling indicator bit
918
0
        value = value / 32;
919
0
        if (value > 0xfff)
920
0
          value = 0xfff;
921
0
        terminator |= CLAUSE_PAUSE_LONG;
922
0
      }
923
50
      return terminator + value;
924
50
    }
925
0
    break;
926
179
  case SSML_SPEAK:
927
179
    if ((attr1 = GetSsmlAttribute(px, "xml:base")) != NULL) {
928
0
      attrcopy_utf8(buf, attr1, sizeof(buf));
929
0
      if ((index = AddNameData(buf, 0)) >= 0)
930
0
        xmlbase = &namedata[index];
931
0
    }
932
179
    if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
933
179
      return 0; // no voice change
934
0
    return CLAUSE_VOICE;
935
3.96k
  case SSML_VOICE:
936
3.96k
    if (GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) == 0)
937
3.71k
      return 0; // no voice change
938
248
    return CLAUSE_VOICE;
939
188
  case SSML_SPEAK + SSML_CLOSE:
940
    // unwind stack until the previous <voice> or <speak> tag
941
188
    while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_SPEAK))
942
0
      (*n_ssml_stack)--;
943
188
    return CLAUSE_PERIOD + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
944
177
  case SSML_VOICE + SSML_CLOSE:
945
    // unwind stack until the previous <voice> or <speak> tag
946
177
    while ((*n_ssml_stack > 1) && (ssml_stack[*n_ssml_stack-1].tag_type != SSML_VOICE))
947
0
      (*n_ssml_stack)--;
948
949
177
    terminator = 0; // ??  Sentence intonation, but no pause ??
950
177
    return terminator + GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
951
353
  case HTML_BREAK:
952
538
  case HTML_BREAK + SSML_CLOSE:
953
538
    return CLAUSE_COLON;
954
549
  case SSML_SENTENCE:
955
549
    if (ssml_sp->tag_type == SSML_SENTENCE) {
956
      // new sentence implies end-of-sentence
957
0
      voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
958
0
    }
959
549
    voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
960
549
    return CLAUSE_PARAGRAPH + voice_change_flag;
961
13.8k
  case SSML_PARAGRAPH:
962
13.8k
    if (ssml_sp->tag_type == SSML_SENTENCE) {
963
      // new paragraph implies end-of-sentence or end-of-paragraph
964
0
      voice_change_flag = GetVoiceAttributes(px, SSML_SENTENCE+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
965
0
    }
966
13.8k
    if (ssml_sp->tag_type == SSML_PARAGRAPH) {
967
      // new paragraph implies end-of-sentence or end-of-paragraph
968
0
      voice_change_flag |= GetVoiceAttributes(px, SSML_PARAGRAPH+SSML_CLOSE, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
969
0
    }
970
13.8k
    voice_change_flag |= GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
971
13.8k
    return CLAUSE_PARAGRAPH + voice_change_flag;
972
274
  case SSML_SENTENCE + SSML_CLOSE:
973
274
    if (ssml_sp->tag_type == SSML_SENTENCE) {
974
      // end of a sentence which specified a language
975
0
      voice_change_flag = GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name);
976
0
    }
977
274
    return CLAUSE_PERIOD + voice_change_flag;
978
546
  case SSML_PARAGRAPH + SSML_CLOSE:
979
546
    if ((ssml_sp->tag_type == SSML_SENTENCE) || (ssml_sp->tag_type == SSML_PARAGRAPH)) {
980
      // End of a paragraph which specified a language.
981
      // (End-of-paragraph also implies end-of-sentence)
982
0
      return GetVoiceAttributes(px, tag_type, ssml_sp, ssml_stack, *n_ssml_stack, current_voice_id, base_voice, base_voice_variant_name) + CLAUSE_PARAGRAPH;
983
0
    }
984
546
    return CLAUSE_PARAGRAPH;
985
207k
  }
986
185k
  return 0;
987
207k
}
988
989
#pragma GCC visibility push(default)
990
/*
 * Install (or clear) the URI callback stored in the file-scope variable
 * uri_callback.
 *
 * The tag handler in this file consults uri_callback when it finds a "src"
 * attribute:
 *   - if uri_callback is NULL, the referenced file is loaded directly with
 *     LoadSoundFile2();
 *   - otherwise it is invoked as uri_callback(1, uri, xmlbase), and a return
 *     value of 0 causes an embedded 'U' command to be written to the output.
 *
 * Passing NULL therefore selects the LoadSoundFile2() path.
 */
ESPEAK_API void espeak_SetUriCallback(int (*UriCallback)(int, const char *, const char *))
991
0
{
992
0
  uri_callback = UriCallback;
993
0
}
994
#pragma GCC visibility pop
995
996
/*
 * Named XML entities recognised by ParseSsmlReference(), each paired with the
 * character value that replaces the entity in the text stream.
 * The entity name is stored without the leading '&' and trailing ';'.
 */
static const MNEM_TAB xml_entity_mnemonics[] = {
997
  { "gt",   '>' },
998
  { "lt",   0xe000 + '<' },   // private usage area, to avoid confusion with XML tag
999
  { "amp",  '&' },
1000
  { "quot", '"' },
1001
  { "nbsp", ' ' },
1002
  { "apos", '\'' },
1003
  { NULL,   -1 }   // end of table; -1 is presumably what LookupMnem() returns when no name matches (ParseSsmlReference tests for it)
1004
};
1005
1006
5.36k
int ParseSsmlReference(char *ref, int *c1, int *c2) {
1007
  // Check if buffer *ref contains an XML character or entity reference
1008
  // if found, set *c1 to the replacement char
1009
  // change *c2 for entity references
1010
  // returns >= 0 on success
1011
1012
5.36k
  if (ref[0] == '#') {
1013
    // character reference
1014
4.14k
    if (ref[1] == 'x')
1015
345
      return sscanf(&ref[2], "%x", (unsigned int *)c1);
1016
3.79k
    else
1017
3.79k
      return sscanf(&ref[1], "%d", c1);
1018
4.14k
  } else { 
1019
    // entity reference
1020
1.22k
    int found;
1021
1.22k
    if ((found = LookupMnem(xml_entity_mnemonics, ref)) != -1) {
1022
619
      *c1 = found;
1023
619
      if (*c2 == 0)
1024
6
        *c2 = ' ';
1025
619
      return found;
1026
619
    }
1027
1.22k
  }
1028
602
  return -1;
1029
5.36k
}