Coverage Report

Created: 2025-07-03 06:49

/src/postgres/src/backend/tsearch/dict_thesaurus.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * dict_thesaurus.c
4
 *    Thesaurus dictionary: phrase to phrase substitution
5
 *
6
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7
 *
8
 *
9
 * IDENTIFICATION
10
 *    src/backend/tsearch/dict_thesaurus.c
11
 *
12
 *-------------------------------------------------------------------------
13
 */
14
#include "postgres.h"
15
16
#include "catalog/namespace.h"
17
#include "commands/defrem.h"
18
#include "tsearch/ts_cache.h"
19
#include "tsearch/ts_locale.h"
20
#include "tsearch/ts_public.h"
21
#include "utils/fmgrprotos.h"
22
#include "utils/regproc.h"
23
24
25
/*
26
 * For now, TSLexeme.flags is borrowed for this file's internal use...
27
 */
28
0
/* Set by addWrd() on substitute words prefixed with '*'; compileTheSubstitute() copies such words without calling the subdictionary and then clears the flag */
#define DT_USEASIS    0x1000
29
30
typedef struct LexemeInfo
31
{
32
  uint32    idsubst;    /* entry's number in DictThesaurus->subst */
33
  uint16    posinsubst;   /* pos info in entry */
34
  uint16    tnvariant;    /* total num lexemes in one variant */
35
  struct LexemeInfo *nextentry;
36
  struct LexemeInfo *nextvariant;
37
} LexemeInfo;
38
39
typedef struct
40
{
41
  char     *lexeme;
42
  LexemeInfo *entries;
43
} TheLexeme;
44
45
typedef struct
46
{
47
  uint16    lastlexeme;   /* number lexemes to substitute */
48
  uint16    reslen;
49
  TSLexeme   *res;      /* prepared substituted result */
50
} TheSubstitute;
51
52
typedef struct
53
{
54
  /* subdictionary to normalize lexemes */
55
  Oid     subdictOid;
56
  TSDictionaryCacheEntry *subdict;
57
58
  /* Array to search lexeme by exact match */
59
  TheLexeme  *wrds;
60
  int     nwrds;      /* current number of words */
61
  int     ntwrds;     /* allocated array length */
62
63
  /*
64
   * Storage of substituted result, n-th element is for n-th expression
65
   */
66
  TheSubstitute *subst;
67
  int     nsubst;
68
} DictThesaurus;
69
70
71
static void
72
newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
73
0
{
74
0
  TheLexeme  *ptr;
75
76
0
  if (d->nwrds >= d->ntwrds)
77
0
  {
78
0
    if (d->ntwrds == 0)
79
0
    {
80
0
      d->ntwrds = 16;
81
0
      d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
82
0
    }
83
0
    else
84
0
    {
85
0
      d->ntwrds *= 2;
86
0
      d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
87
0
    }
88
0
  }
89
90
0
  ptr = d->wrds + d->nwrds;
91
0
  d->nwrds++;
92
93
0
  ptr->lexeme = palloc(e - b + 1);
94
95
0
  memcpy(ptr->lexeme, b, e - b);
96
0
  ptr->lexeme[e - b] = '\0';
97
98
0
  ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
99
100
0
  ptr->entries->nextentry = NULL;
101
0
  ptr->entries->idsubst = idsubst;
102
0
  ptr->entries->posinsubst = posinsubst;
103
0
}
104
105
static void
106
addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
107
0
{
108
0
  static int  nres = 0;
109
0
  static int  ntres = 0;
110
0
  TheSubstitute *ptr;
111
112
0
  if (nwrd == 0)
113
0
  {
114
0
    nres = ntres = 0;
115
116
0
    if (idsubst >= d->nsubst)
117
0
    {
118
0
      if (d->nsubst == 0)
119
0
      {
120
0
        d->nsubst = 16;
121
0
        d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
122
0
      }
123
0
      else
124
0
      {
125
0
        d->nsubst *= 2;
126
0
        d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
127
0
      }
128
0
    }
129
0
  }
130
131
0
  ptr = d->subst + idsubst;
132
133
0
  ptr->lastlexeme = posinsubst - 1;
134
135
0
  if (nres + 1 >= ntres)
136
0
  {
137
0
    if (ntres == 0)
138
0
    {
139
0
      ntres = 2;
140
0
      ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
141
0
    }
142
0
    else
143
0
    {
144
0
      ntres *= 2;
145
0
      ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
146
0
    }
147
0
  }
148
149
0
  ptr->res[nres].lexeme = palloc(e - b + 1);
150
0
  memcpy(ptr->res[nres].lexeme, b, e - b);
151
0
  ptr->res[nres].lexeme[e - b] = '\0';
152
153
0
  ptr->res[nres].nvariant = nwrd;
154
0
  if (useasis)
155
0
    ptr->res[nres].flags = DT_USEASIS;
156
0
  else
157
0
    ptr->res[nres].flags = 0;
158
159
0
  ptr->res[++nres].lexeme = NULL;
160
0
}
161
162
0
#define TR_WAITLEX  1
163
0
#define TR_INLEX  2
164
0
#define TR_WAITSUBS 3
165
0
#define TR_INSUBS 4
166
167
static void
168
thesaurusRead(const char *filename, DictThesaurus *d)
169
0
{
170
0
  tsearch_readline_state trst;
171
0
  uint32    idsubst = 0;
172
0
  bool    useasis = false;
173
0
  char     *line;
174
175
0
  filename = get_tsearch_config_filename(filename, "ths");
176
0
  if (!tsearch_readline_begin(&trst, filename))
177
0
    ereport(ERROR,
178
0
        (errcode(ERRCODE_CONFIG_FILE_ERROR),
179
0
         errmsg("could not open thesaurus file \"%s\": %m",
180
0
            filename)));
181
182
0
  while ((line = tsearch_readline(&trst)) != NULL)
183
0
  {
184
0
    char     *ptr;
185
0
    int     state = TR_WAITLEX;
186
0
    char     *beginwrd = NULL;
187
0
    uint32    posinsubst = 0;
188
0
    uint32    nwrd = 0;
189
190
0
    ptr = line;
191
192
    /* is it a comment? */
193
0
    while (*ptr && isspace((unsigned char) *ptr))
194
0
      ptr += pg_mblen(ptr);
195
196
0
    if (t_iseq(ptr, '#') || *ptr == '\0' ||
197
0
      t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
198
0
    {
199
0
      pfree(line);
200
0
      continue;
201
0
    }
202
203
0
    while (*ptr)
204
0
    {
205
0
      if (state == TR_WAITLEX)
206
0
      {
207
0
        if (t_iseq(ptr, ':'))
208
0
        {
209
0
          if (posinsubst == 0)
210
0
            ereport(ERROR,
211
0
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
212
0
                 errmsg("unexpected delimiter")));
213
0
          state = TR_WAITSUBS;
214
0
        }
215
0
        else if (!isspace((unsigned char) *ptr))
216
0
        {
217
0
          beginwrd = ptr;
218
0
          state = TR_INLEX;
219
0
        }
220
0
      }
221
0
      else if (state == TR_INLEX)
222
0
      {
223
0
        if (t_iseq(ptr, ':'))
224
0
        {
225
0
          newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
226
0
          state = TR_WAITSUBS;
227
0
        }
228
0
        else if (isspace((unsigned char) *ptr))
229
0
        {
230
0
          newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
231
0
          state = TR_WAITLEX;
232
0
        }
233
0
      }
234
0
      else if (state == TR_WAITSUBS)
235
0
      {
236
0
        if (t_iseq(ptr, '*'))
237
0
        {
238
0
          useasis = true;
239
0
          state = TR_INSUBS;
240
0
          beginwrd = ptr + pg_mblen(ptr);
241
0
        }
242
0
        else if (t_iseq(ptr, '\\'))
243
0
        {
244
0
          useasis = false;
245
0
          state = TR_INSUBS;
246
0
          beginwrd = ptr + pg_mblen(ptr);
247
0
        }
248
0
        else if (!isspace((unsigned char) *ptr))
249
0
        {
250
0
          useasis = false;
251
0
          beginwrd = ptr;
252
0
          state = TR_INSUBS;
253
0
        }
254
0
      }
255
0
      else if (state == TR_INSUBS)
256
0
      {
257
0
        if (isspace((unsigned char) *ptr))
258
0
        {
259
0
          if (ptr == beginwrd)
260
0
            ereport(ERROR,
261
0
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
262
0
                 errmsg("unexpected end of line or lexeme")));
263
0
          addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
264
0
          state = TR_WAITSUBS;
265
0
        }
266
0
      }
267
0
      else
268
0
        elog(ERROR, "unrecognized thesaurus state: %d", state);
269
270
0
      ptr += pg_mblen(ptr);
271
0
    }
272
273
0
    if (state == TR_INSUBS)
274
0
    {
275
0
      if (ptr == beginwrd)
276
0
        ereport(ERROR,
277
0
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
278
0
             errmsg("unexpected end of line or lexeme")));
279
0
      addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
280
0
    }
281
282
0
    idsubst++;
283
284
0
    if (!(nwrd && posinsubst))
285
0
      ereport(ERROR,
286
0
          (errcode(ERRCODE_CONFIG_FILE_ERROR),
287
0
           errmsg("unexpected end of line")));
288
289
0
    if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
290
0
      ereport(ERROR,
291
0
          (errcode(ERRCODE_CONFIG_FILE_ERROR),
292
0
           errmsg("too many lexemes in thesaurus entry")));
293
294
0
    pfree(line);
295
0
  }
296
297
0
  d->nsubst = idsubst;
298
299
0
  tsearch_readline_end(&trst);
300
0
}
301
302
static TheLexeme *
303
addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
304
0
{
305
0
  if (*nnw >= *tnm)
306
0
  {
307
0
    *tnm *= 2;
308
0
    newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
309
0
  }
310
311
0
  newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
312
313
0
  if (lexeme && lexeme->lexeme)
314
0
  {
315
0
    newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
316
0
    newwrds[*nnw].entries->tnvariant = tnvariant;
317
0
  }
318
0
  else
319
0
  {
320
0
    newwrds[*nnw].lexeme = NULL;
321
0
    newwrds[*nnw].entries->tnvariant = 1;
322
0
  }
323
324
0
  newwrds[*nnw].entries->idsubst = src->idsubst;
325
0
  newwrds[*nnw].entries->posinsubst = src->posinsubst;
326
327
0
  newwrds[*nnw].entries->nextentry = NULL;
328
329
0
  (*nnw)++;
330
0
  return newwrds;
331
0
}
332
333
static int
334
cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
335
0
{
336
0
  if (a == NULL || b == NULL)
337
0
    return 0;
338
339
0
  if (a->idsubst == b->idsubst)
340
0
  {
341
0
    if (a->posinsubst == b->posinsubst)
342
0
    {
343
0
      if (a->tnvariant == b->tnvariant)
344
0
        return 0;
345
346
0
      return (a->tnvariant > b->tnvariant) ? 1 : -1;
347
0
    }
348
349
0
    return (a->posinsubst > b->posinsubst) ? 1 : -1;
350
0
  }
351
352
0
  return (a->idsubst > b->idsubst) ? 1 : -1;
353
0
}
354
355
static int
356
cmpLexeme(const TheLexeme *a, const TheLexeme *b)
357
0
{
358
0
  if (a->lexeme == NULL)
359
0
  {
360
0
    if (b->lexeme == NULL)
361
0
      return 0;
362
0
    else
363
0
      return 1;
364
0
  }
365
0
  else if (b->lexeme == NULL)
366
0
    return -1;
367
368
0
  return strcmp(a->lexeme, b->lexeme);
369
0
}
370
371
static int
372
cmpLexemeQ(const void *a, const void *b)
373
0
{
374
0
  return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
375
0
}
376
377
static int
378
cmpTheLexeme(const void *a, const void *b)
379
0
{
380
0
  const TheLexeme *la = (const TheLexeme *) a;
381
0
  const TheLexeme *lb = (const TheLexeme *) b;
382
0
  int     res;
383
384
0
  if ((res = cmpLexeme(la, lb)) != 0)
385
0
    return res;
386
387
0
  return -cmpLexemeInfo(la->entries, lb->entries);
388
0
}
389
390
static void
391
compileTheLexeme(DictThesaurus *d)
392
0
{
393
0
  int     i,
394
0
        nnw = 0,
395
0
        tnm = 16;
396
0
  TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
397
0
         *ptrwrds;
398
399
0
  for (i = 0; i < d->nwrds; i++)
400
0
  {
401
0
    TSLexeme   *ptr;
402
403
0
    if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
404
0
      newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
405
0
    else
406
0
    {
407
0
      ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
408
0
                               PointerGetDatum(d->subdict->dictData),
409
0
                               PointerGetDatum(d->wrds[i].lexeme),
410
0
                               Int32GetDatum(strlen(d->wrds[i].lexeme)),
411
0
                               PointerGetDatum(NULL)));
412
413
0
      if (!ptr)
414
0
        ereport(ERROR,
415
0
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
416
0
             errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
417
0
                d->wrds[i].lexeme,
418
0
                d->wrds[i].entries->idsubst + 1)));
419
0
      else if (!(ptr->lexeme))
420
0
        ereport(ERROR,
421
0
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
422
0
             errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
423
0
                d->wrds[i].lexeme,
424
0
                d->wrds[i].entries->idsubst + 1),
425
0
             errhint("Use \"?\" to represent a stop word within a sample phrase.")));
426
0
      else
427
0
      {
428
0
        while (ptr->lexeme)
429
0
        {
430
0
          TSLexeme   *remptr = ptr + 1;
431
0
          int     tnvar = 1;
432
0
          int     curvar = ptr->nvariant;
433
434
          /* compute n words in one variant */
435
0
          while (remptr->lexeme)
436
0
          {
437
0
            if (remptr->nvariant != (remptr - 1)->nvariant)
438
0
              break;
439
0
            tnvar++;
440
0
            remptr++;
441
0
          }
442
443
0
          remptr = ptr;
444
0
          while (remptr->lexeme && remptr->nvariant == curvar)
445
0
          {
446
0
            newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
447
0
            remptr++;
448
0
          }
449
450
0
          ptr = remptr;
451
0
        }
452
0
      }
453
0
    }
454
455
0
    pfree(d->wrds[i].lexeme);
456
0
    pfree(d->wrds[i].entries);
457
0
  }
458
459
0
  if (d->wrds)
460
0
    pfree(d->wrds);
461
0
  d->wrds = newwrds;
462
0
  d->nwrds = nnw;
463
0
  d->ntwrds = tnm;
464
465
0
  if (d->nwrds > 1)
466
0
  {
467
0
    qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
468
469
    /* uniq */
470
0
    newwrds = d->wrds;
471
0
    ptrwrds = d->wrds + 1;
472
0
    while (ptrwrds - d->wrds < d->nwrds)
473
0
    {
474
0
      if (cmpLexeme(ptrwrds, newwrds) == 0)
475
0
      {
476
0
        if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
477
0
        {
478
0
          ptrwrds->entries->nextentry = newwrds->entries;
479
0
          newwrds->entries = ptrwrds->entries;
480
0
        }
481
0
        else
482
0
          pfree(ptrwrds->entries);
483
484
0
        if (ptrwrds->lexeme)
485
0
          pfree(ptrwrds->lexeme);
486
0
      }
487
0
      else
488
0
      {
489
0
        newwrds++;
490
0
        *newwrds = *ptrwrds;
491
0
      }
492
493
0
      ptrwrds++;
494
0
    }
495
496
0
    d->nwrds = newwrds - d->wrds + 1;
497
0
    d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
498
0
  }
499
0
}
500
501
static void
502
compileTheSubstitute(DictThesaurus *d)
503
0
{
504
0
  int     i;
505
506
0
  for (i = 0; i < d->nsubst; i++)
507
0
  {
508
0
    TSLexeme   *rem = d->subst[i].res,
509
0
           *outptr,
510
0
           *inptr;
511
0
    int     n = 2;
512
513
0
    outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
514
0
    outptr->lexeme = NULL;
515
0
    inptr = rem;
516
517
0
    while (inptr && inptr->lexeme)
518
0
    {
519
0
      TSLexeme   *lexized,
520
0
            tmplex[2];
521
522
0
      if (inptr->flags & DT_USEASIS)
523
0
      {         /* do not lexize */
524
0
        tmplex[0] = *inptr;
525
0
        tmplex[0].flags = 0;
526
0
        tmplex[1].lexeme = NULL;
527
0
        lexized = tmplex;
528
0
      }
529
0
      else
530
0
      {
531
0
        lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
532
0
                                   PointerGetDatum(d->subdict->dictData),
533
0
                                   PointerGetDatum(inptr->lexeme),
534
0
                                   Int32GetDatum(strlen(inptr->lexeme)),
535
0
                                   PointerGetDatum(NULL)));
536
0
      }
537
538
0
      if (lexized && lexized->lexeme)
539
0
      {
540
0
        int     toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
541
542
0
        while (lexized->lexeme)
543
0
        {
544
0
          if (outptr - d->subst[i].res + 1 >= n)
545
0
          {
546
0
            int     diff = outptr - d->subst[i].res;
547
548
0
            n *= 2;
549
0
            d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
550
0
            outptr = d->subst[i].res + diff;
551
0
          }
552
553
0
          *outptr = *lexized;
554
0
          outptr->lexeme = pstrdup(lexized->lexeme);
555
556
0
          outptr++;
557
0
          lexized++;
558
0
        }
559
560
0
        if (toset > 0)
561
0
          d->subst[i].res[toset].flags |= TSL_ADDPOS;
562
0
      }
563
0
      else if (lexized)
564
0
      {
565
0
        ereport(ERROR,
566
0
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
567
0
             errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568
0
                inptr->lexeme, i + 1)));
569
0
      }
570
0
      else
571
0
      {
572
0
        ereport(ERROR,
573
0
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
574
0
             errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575
0
                inptr->lexeme, i + 1)));
576
0
      }
577
578
0
      if (inptr->lexeme)
579
0
        pfree(inptr->lexeme);
580
0
      inptr++;
581
0
    }
582
583
0
    if (outptr == d->subst[i].res)
584
0
      ereport(ERROR,
585
0
          (errcode(ERRCODE_CONFIG_FILE_ERROR),
586
0
           errmsg("thesaurus substitute phrase is empty (rule %d)",
587
0
              i + 1)));
588
589
0
    d->subst[i].reslen = outptr - d->subst[i].res;
590
591
0
    pfree(rem);
592
0
  }
593
0
}
594
595
Datum
596
thesaurus_init(PG_FUNCTION_ARGS)
597
0
{
598
0
  List     *dictoptions = (List *) PG_GETARG_POINTER(0);
599
0
  DictThesaurus *d;
600
0
  char     *subdictname = NULL;
601
0
  bool    fileloaded = false;
602
0
  List     *namelist;
603
0
  ListCell   *l;
604
605
0
  d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
606
607
0
  foreach(l, dictoptions)
608
0
  {
609
0
    DefElem    *defel = (DefElem *) lfirst(l);
610
611
0
    if (strcmp(defel->defname, "dictfile") == 0)
612
0
    {
613
0
      if (fileloaded)
614
0
        ereport(ERROR,
615
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
616
0
             errmsg("multiple DictFile parameters")));
617
0
      thesaurusRead(defGetString(defel), d);
618
0
      fileloaded = true;
619
0
    }
620
0
    else if (strcmp(defel->defname, "dictionary") == 0)
621
0
    {
622
0
      if (subdictname)
623
0
        ereport(ERROR,
624
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
625
0
             errmsg("multiple Dictionary parameters")));
626
0
      subdictname = pstrdup(defGetString(defel));
627
0
    }
628
0
    else
629
0
    {
630
0
      ereport(ERROR,
631
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
632
0
           errmsg("unrecognized Thesaurus parameter: \"%s\"",
633
0
              defel->defname)));
634
0
    }
635
0
  }
636
637
0
  if (!fileloaded)
638
0
    ereport(ERROR,
639
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
640
0
         errmsg("missing DictFile parameter")));
641
0
  if (!subdictname)
642
0
    ereport(ERROR,
643
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
644
0
         errmsg("missing Dictionary parameter")));
645
646
0
  namelist = stringToQualifiedNameList(subdictname, NULL);
647
0
  d->subdictOid = get_ts_dict_oid(namelist, false);
648
0
  d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
649
650
0
  compileTheLexeme(d);
651
0
  compileTheSubstitute(d);
652
653
0
  PG_RETURN_POINTER(d);
654
0
}
655
656
static LexemeInfo *
657
findTheLexeme(DictThesaurus *d, char *lexeme)
658
0
{
659
0
  TheLexeme key,
660
0
         *res;
661
662
0
  if (d->nwrds == 0)
663
0
    return NULL;
664
665
0
  key.lexeme = lexeme;
666
0
  key.entries = NULL;
667
668
0
  res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
669
670
0
  if (res == NULL)
671
0
    return NULL;
672
0
  return res->entries;
673
0
}
674
675
static bool
676
matchIdSubst(LexemeInfo *stored, uint32 idsubst)
677
0
{
678
0
  bool    res = true;
679
680
0
  if (stored)
681
0
  {
682
0
    res = false;
683
684
0
    for (; stored; stored = stored->nextvariant)
685
0
      if (stored->idsubst == idsubst)
686
0
      {
687
0
        res = true;
688
0
        break;
689
0
      }
690
0
  }
691
692
0
  return res;
693
0
}
694
695
static LexemeInfo *
696
findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
697
0
{
698
0
  for (;;)
699
0
  {
700
0
    int     i;
701
0
    LexemeInfo *ptr = newin[0];
702
703
0
    for (i = 0; i < newn; i++)
704
0
    {
705
0
      while (newin[i] && newin[i]->idsubst < ptr->idsubst)
706
0
        newin[i] = newin[i]->nextentry;
707
708
0
      if (newin[i] == NULL)
709
0
        return in;
710
711
0
      if (newin[i]->idsubst > ptr->idsubst)
712
0
      {
713
0
        ptr = newin[i];
714
0
        i = -1;
715
0
        continue;
716
0
      }
717
718
0
      while (newin[i]->idsubst == ptr->idsubst)
719
0
      {
720
0
        if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
721
0
        {
722
0
          ptr = newin[i];
723
0
          break;
724
0
        }
725
726
0
        newin[i] = newin[i]->nextentry;
727
0
        if (newin[i] == NULL)
728
0
          return in;
729
0
      }
730
731
0
      if (newin[i]->idsubst != ptr->idsubst)
732
0
      {
733
0
        ptr = newin[i];
734
0
        i = -1;
735
0
        continue;
736
0
      }
737
0
    }
738
739
0
    if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
740
0
    {           /* found */
741
742
0
      ptr->nextvariant = in;
743
0
      in = ptr;
744
0
    }
745
746
    /* step forward */
747
0
    for (i = 0; i < newn; i++)
748
0
      newin[i] = newin[i]->nextentry;
749
0
  }
750
0
}
751
752
static TSLexeme *
753
copyTSLexeme(TheSubstitute *ts)
754
0
{
755
0
  TSLexeme   *res;
756
0
  uint16    i;
757
758
0
  res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
759
0
  for (i = 0; i < ts->reslen; i++)
760
0
  {
761
0
    res[i] = ts->res[i];
762
0
    res[i].lexeme = pstrdup(ts->res[i].lexeme);
763
0
  }
764
765
0
  res[ts->reslen].lexeme = NULL;
766
767
0
  return res;
768
0
}
769
770
static TSLexeme *
771
checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
772
0
{
773
0
  *moreres = false;
774
0
  while (info)
775
0
  {
776
0
    Assert(info->idsubst < d->nsubst);
777
0
    if (info->nextvariant)
778
0
      *moreres = true;
779
0
    if (d->subst[info->idsubst].lastlexeme == curpos)
780
0
      return copyTSLexeme(d->subst + info->idsubst);
781
0
    info = info->nextvariant;
782
0
  }
783
784
0
  return NULL;
785
0
}
786
787
Datum
788
thesaurus_lexize(PG_FUNCTION_ARGS)
789
0
{
790
0
  DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
791
0
  DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
792
0
  TSLexeme   *res = NULL;
793
0
  LexemeInfo *stored,
794
0
         *info = NULL;
795
0
  uint16    curpos = 0;
796
0
  bool    moreres = false;
797
798
0
  if (PG_NARGS() != 4 || dstate == NULL)
799
0
    elog(ERROR, "forbidden call of thesaurus or nested call");
800
801
0
  if (dstate->isend)
802
0
    PG_RETURN_POINTER(NULL);
803
0
  stored = (LexemeInfo *) dstate->private_state;
804
805
0
  if (stored)
806
0
    curpos = stored->posinsubst + 1;
807
808
0
  if (!d->subdict->isvalid)
809
0
    d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
810
811
0
  res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
812
0
                           PointerGetDatum(d->subdict->dictData),
813
0
                           PG_GETARG_DATUM(1),
814
0
                           PG_GETARG_DATUM(2),
815
0
                           PointerGetDatum(NULL)));
816
817
0
  if (res && res->lexeme)
818
0
  {
819
0
    TSLexeme   *ptr = res,
820
0
           *basevar;
821
822
0
    while (ptr->lexeme)
823
0
    {
824
0
      uint16    nv = ptr->nvariant;
825
0
      uint16    i,
826
0
            nlex = 0;
827
0
      LexemeInfo **infos;
828
829
0
      basevar = ptr;
830
0
      while (ptr->lexeme && nv == ptr->nvariant)
831
0
      {
832
0
        nlex++;
833
0
        ptr++;
834
0
      }
835
836
0
      infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
837
0
      for (i = 0; i < nlex; i++)
838
0
        if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
839
0
          break;
840
841
0
      if (i < nlex)
842
0
      {
843
        /* no chance to find */
844
0
        pfree(infos);
845
0
        continue;
846
0
      }
847
848
0
      info = findVariant(info, stored, curpos, infos, nlex);
849
0
    }
850
0
  }
851
0
  else if (res)
852
0
  {             /* stop-word */
853
0
    LexemeInfo *infos = findTheLexeme(d, NULL);
854
855
0
    info = findVariant(NULL, stored, curpos, &infos, 1);
856
0
  }
857
0
  else
858
0
  {
859
0
    info = NULL;      /* word isn't recognized */
860
0
  }
861
862
0
  dstate->private_state = info;
863
864
0
  if (!info)
865
0
  {
866
0
    dstate->getnext = false;
867
0
    PG_RETURN_POINTER(NULL);
868
0
  }
869
870
0
  if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
871
0
  {
872
0
    dstate->getnext = moreres;
873
0
    PG_RETURN_POINTER(res);
874
0
  }
875
876
0
  dstate->getnext = true;
877
878
0
  PG_RETURN_POINTER(NULL);
879
0
}