Coverage Report

Created: 2025-08-12 06:43

/src/postgres/src/backend/tsearch/ts_parse.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * ts_parse.c
4
 *    main parse functions for tsearch
5
 *
6
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7
 *
8
 *
9
 * IDENTIFICATION
10
 *    src/backend/tsearch/ts_parse.c
11
 *
12
 *-------------------------------------------------------------------------
13
 */
14
15
#include "postgres.h"
16
17
#include "tsearch/ts_cache.h"
18
#include "tsearch/ts_utils.h"
19
#include "varatt.h"
20
21
#define IGNORE_LONGLEXEME 1
22
23
/*
24
 * Lexize subsystem
25
 */
26
27
/*
 * One token handed to the lexize machinery, kept as a node of a singly
 * linked list (see ListParsedLex).
 */
typedef struct ParsedLex ParsedLex;

struct ParsedLex
{
	int			type;			/* token type as passed to LexizeAddLemm */
	char	   *lemm;			/* token text; the pointer is stored as-is,
								 * not copied */
	int			lenlemm;		/* length of lemm */
	ParsedLex  *next;			/* following node, or NULL at the list end */
};
34
35
typedef struct ListParsedLex
36
{
37
  ParsedLex  *head;
38
  ParsedLex  *tail;
39
} ListParsedLex;
40
41
typedef struct
42
{
43
  TSConfigCacheEntry *cfg;
44
  Oid     curDictId;
45
  int     posDict;
46
  DictSubState dictState;
47
  ParsedLex  *curSub;
48
  ListParsedLex towork;   /* current list to work */
49
  ListParsedLex waste;    /* list of lexemes that already lexized */
50
51
  /*
52
   * fields to store last variant to lexize (basically, thesaurus or similar
53
   * to, which wants several lexemes
54
   */
55
56
  ParsedLex  *lastRes;
57
  TSLexeme   *tmpRes;
58
} LexizeData;
59
60
static void
61
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
62
0
{
63
0
  ld->cfg = cfg;
64
0
  ld->curDictId = InvalidOid;
65
0
  ld->posDict = 0;
66
0
  ld->towork.head = ld->towork.tail = ld->curSub = NULL;
67
0
  ld->waste.head = ld->waste.tail = NULL;
68
0
  ld->lastRes = NULL;
69
0
  ld->tmpRes = NULL;
70
0
}
71
72
static void
73
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
74
0
{
75
0
  if (list->tail)
76
0
  {
77
0
    list->tail->next = newpl;
78
0
    list->tail = newpl;
79
0
  }
80
0
  else
81
0
    list->head = list->tail = newpl;
82
0
  newpl->next = NULL;
83
0
}
84
85
static ParsedLex *
86
LPLRemoveHead(ListParsedLex *list)
87
0
{
88
0
  ParsedLex  *res = list->head;
89
90
0
  if (list->head)
91
0
    list->head = list->head->next;
92
93
0
  if (list->head == NULL)
94
0
    list->tail = NULL;
95
96
0
  return res;
97
0
}
98
99
static void
100
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
101
0
{
102
0
  ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
103
104
0
  newpl->type = type;
105
0
  newpl->lemm = lemm;
106
0
  newpl->lenlemm = lenlemm;
107
0
  LPLAddTail(&ld->towork, newpl);
108
0
  ld->curSub = ld->towork.tail;
109
0
}
110
111
static void
112
RemoveHead(LexizeData *ld)
113
0
{
114
0
  LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
115
116
0
  ld->posDict = 0;
117
0
}
118
119
static void
120
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
121
0
{
122
0
  if (correspondLexem)
123
0
  {
124
0
    *correspondLexem = ld->waste.head;
125
0
  }
126
0
  else
127
0
  {
128
0
    ParsedLex  *tmp,
129
0
           *ptr = ld->waste.head;
130
131
0
    while (ptr)
132
0
    {
133
0
      tmp = ptr->next;
134
0
      pfree(ptr);
135
0
      ptr = tmp;
136
0
    }
137
0
  }
138
0
  ld->waste.head = ld->waste.tail = NULL;
139
0
}
140
141
static void
142
moveToWaste(LexizeData *ld, ParsedLex *stop)
143
0
{
144
0
  bool    go = true;
145
146
0
  while (ld->towork.head && go)
147
0
  {
148
0
    if (ld->towork.head == stop)
149
0
    {
150
0
      ld->curSub = stop->next;
151
0
      go = false;
152
0
    }
153
0
    RemoveHead(ld);
154
0
  }
155
0
}
156
157
static void
158
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
159
0
{
160
0
  if (ld->tmpRes)
161
0
  {
162
0
    TSLexeme   *ptr;
163
164
0
    for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
165
0
      pfree(ptr->lexeme);
166
0
    pfree(ld->tmpRes);
167
0
  }
168
0
  ld->tmpRes = res;
169
0
  ld->lastRes = lex;
170
0
}
171
172
static TSLexeme *
173
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
174
0
{
175
0
  int     i;
176
0
  ListDictionary *map;
177
0
  TSDictionaryCacheEntry *dict;
178
0
  TSLexeme   *res;
179
180
0
  if (ld->curDictId == InvalidOid)
181
0
  {
182
    /*
183
     * usual mode: dictionary wants only one word, but we should keep in
184
     * mind that we should go through all stack
185
     */
186
187
0
    while (ld->towork.head)
188
0
    {
189
0
      ParsedLex  *curVal = ld->towork.head;
190
0
      char     *curValLemm = curVal->lemm;
191
0
      int     curValLenLemm = curVal->lenlemm;
192
193
0
      map = ld->cfg->map + curVal->type;
194
195
0
      if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
196
0
      {
197
        /* skip this type of lexeme */
198
0
        RemoveHead(ld);
199
0
        continue;
200
0
      }
201
202
0
      for (i = ld->posDict; i < map->len; i++)
203
0
      {
204
0
        dict = lookup_ts_dictionary_cache(map->dictIds[i]);
205
206
0
        ld->dictState.isend = ld->dictState.getnext = false;
207
0
        ld->dictState.private_state = NULL;
208
0
        res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
209
0
                                 PointerGetDatum(dict->dictData),
210
0
                                 PointerGetDatum(curValLemm),
211
0
                                 Int32GetDatum(curValLenLemm),
212
0
                                 PointerGetDatum(&ld->dictState)));
213
214
0
        if (ld->dictState.getnext)
215
0
        {
216
          /*
217
           * dictionary wants next word, so setup and store current
218
           * position and go to multiword mode
219
           */
220
221
0
          ld->curDictId = map->dictIds[i];
222
0
          ld->posDict = i + 1;
223
0
          ld->curSub = curVal->next;
224
0
          if (res)
225
0
            setNewTmpRes(ld, curVal, res);
226
0
          return LexizeExec(ld, correspondLexem);
227
0
        }
228
229
0
        if (!res)   /* dictionary doesn't know this lexeme */
230
0
          continue;
231
232
0
        if (res->flags & TSL_FILTER)
233
0
        {
234
0
          curValLemm = res->lexeme;
235
0
          curValLenLemm = strlen(res->lexeme);
236
0
          continue;
237
0
        }
238
239
0
        RemoveHead(ld);
240
0
        setCorrLex(ld, correspondLexem);
241
0
        return res;
242
0
      }
243
244
0
      RemoveHead(ld);
245
0
    }
246
0
  }
247
0
  else
248
0
  {             /* curDictId is valid */
249
0
    dict = lookup_ts_dictionary_cache(ld->curDictId);
250
251
    /*
252
     * Dictionary ld->curDictId asks us about following words
253
     */
254
255
0
    while (ld->curSub)
256
0
    {
257
0
      ParsedLex  *curVal = ld->curSub;
258
259
0
      map = ld->cfg->map + curVal->type;
260
261
0
      if (curVal->type != 0)
262
0
      {
263
0
        bool    dictExists = false;
264
265
0
        if (curVal->type >= ld->cfg->lenmap || map->len == 0)
266
0
        {
267
          /* skip this type of lexeme */
268
0
          ld->curSub = curVal->next;
269
0
          continue;
270
0
        }
271
272
        /*
273
         * We should be sure that current type of lexeme is recognized
274
         * by our dictionary: we just check is it exist in list of
275
         * dictionaries ?
276
         */
277
0
        for (i = 0; i < map->len && !dictExists; i++)
278
0
          if (ld->curDictId == map->dictIds[i])
279
0
            dictExists = true;
280
281
0
        if (!dictExists)
282
0
        {
283
          /*
284
           * Dictionary can't work with current type of lexeme,
285
           * return to basic mode and redo all stored lexemes
286
           */
287
0
          ld->curDictId = InvalidOid;
288
0
          return LexizeExec(ld, correspondLexem);
289
0
        }
290
0
      }
291
292
0
      ld->dictState.isend = (curVal->type == 0);
293
0
      ld->dictState.getnext = false;
294
295
0
      res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
296
0
                               PointerGetDatum(dict->dictData),
297
0
                               PointerGetDatum(curVal->lemm),
298
0
                               Int32GetDatum(curVal->lenlemm),
299
0
                               PointerGetDatum(&ld->dictState)));
300
301
0
      if (ld->dictState.getnext)
302
0
      {
303
        /* Dictionary wants one more */
304
0
        ld->curSub = curVal->next;
305
0
        if (res)
306
0
          setNewTmpRes(ld, curVal, res);
307
0
        continue;
308
0
      }
309
310
0
      if (res || ld->tmpRes)
311
0
      {
312
        /*
313
         * Dictionary normalizes lexemes, so we remove from stack all
314
         * used lexemes, return to basic mode and redo end of stack
315
         * (if it exists)
316
         */
317
0
        if (res)
318
0
        {
319
0
          moveToWaste(ld, ld->curSub);
320
0
        }
321
0
        else
322
0
        {
323
0
          res = ld->tmpRes;
324
0
          moveToWaste(ld, ld->lastRes);
325
0
        }
326
327
        /* reset to initial state */
328
0
        ld->curDictId = InvalidOid;
329
0
        ld->posDict = 0;
330
0
        ld->lastRes = NULL;
331
0
        ld->tmpRes = NULL;
332
0
        setCorrLex(ld, correspondLexem);
333
0
        return res;
334
0
      }
335
336
      /*
337
       * Dict don't want next lexem and didn't recognize anything, redo
338
       * from ld->towork.head
339
       */
340
0
      ld->curDictId = InvalidOid;
341
0
      return LexizeExec(ld, correspondLexem);
342
0
    }
343
0
  }
344
345
0
  setCorrLex(ld, correspondLexem);
346
0
  return NULL;
347
0
}
348
349
/*
350
 * Parse string and lexize words.
351
 *
352
 * prs will be filled in.
353
 */
354
void
355
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
356
0
{
357
0
  int     type,
358
0
        lenlemm = 0;  /* silence compiler warning */
359
0
  char     *lemm = NULL;
360
0
  LexizeData  ldata;
361
0
  TSLexeme   *norms;
362
0
  TSConfigCacheEntry *cfg;
363
0
  TSParserCacheEntry *prsobj;
364
0
  void     *prsdata;
365
366
0
  cfg = lookup_ts_config_cache(cfgId);
367
0
  prsobj = lookup_ts_parser_cache(cfg->prsId);
368
369
0
  prsdata = DatumGetPointer(FunctionCall2(&prsobj->prsstart,
370
0
                      PointerGetDatum(buf),
371
0
                      Int32GetDatum(buflen)));
372
373
0
  LexizeInit(&ldata, cfg);
374
375
0
  do
376
0
  {
377
0
    type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
378
0
                       PointerGetDatum(prsdata),
379
0
                       PointerGetDatum(&lemm),
380
0
                       PointerGetDatum(&lenlemm)));
381
382
0
    if (type > 0 && lenlemm >= MAXSTRLEN)
383
0
    {
384
0
#ifdef IGNORE_LONGLEXEME
385
0
      ereport(NOTICE,
386
0
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
387
0
           errmsg("word is too long to be indexed"),
388
0
           errdetail("Words longer than %d characters are ignored.",
389
0
                 MAXSTRLEN)));
390
0
      continue;
391
#else
392
      ereport(ERROR,
393
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
394
           errmsg("word is too long to be indexed"),
395
           errdetail("Words longer than %d characters are ignored.",
396
                 MAXSTRLEN)));
397
#endif
398
0
    }
399
400
0
    LexizeAddLemm(&ldata, type, lemm, lenlemm);
401
402
0
    while ((norms = LexizeExec(&ldata, NULL)) != NULL)
403
0
    {
404
0
      TSLexeme   *ptr = norms;
405
406
0
      prs->pos++;     /* set pos */
407
408
0
      while (ptr->lexeme)
409
0
      {
410
0
        if (prs->curwords == prs->lenwords)
411
0
        {
412
0
          prs->lenwords *= 2;
413
0
          prs->words = (ParsedWord *) repalloc(prs->words, prs->lenwords * sizeof(ParsedWord));
414
0
        }
415
416
0
        if (ptr->flags & TSL_ADDPOS)
417
0
          prs->pos++;
418
0
        prs->words[prs->curwords].len = strlen(ptr->lexeme);
419
0
        prs->words[prs->curwords].word = ptr->lexeme;
420
0
        prs->words[prs->curwords].nvariant = ptr->nvariant;
421
0
        prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
422
0
        prs->words[prs->curwords].alen = 0;
423
0
        prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
424
0
        ptr++;
425
0
        prs->curwords++;
426
0
      }
427
0
      pfree(norms);
428
0
    }
429
0
  } while (type > 0);
430
431
0
  FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
432
0
}
433
434
/*
435
 * Headline framework
436
 */
437
438
/* Add a word to prs->words[] */
439
static void
440
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
441
0
{
442
0
  if (prs->curwords >= prs->lenwords)
443
0
  {
444
0
    prs->lenwords *= 2;
445
0
    prs->words = (HeadlineWordEntry *) repalloc(prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
446
0
  }
447
0
  memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
448
0
  prs->words[prs->curwords].type = (uint8) type;
449
0
  prs->words[prs->curwords].len = buflen;
450
0
  prs->words[prs->curwords].word = palloc(buflen);
451
0
  memcpy(prs->words[prs->curwords].word, buf, buflen);
452
0
  prs->curwords++;
453
0
}
454
455
/*
456
 * Add pos and matching-query-item data to the just-added word.
457
 * Here, buf/buflen represent a processed lexeme, not raw token text.
458
 *
459
 * If the query contains more than one matching item, we replicate
460
 * the last-added word so that each item can be pointed to.  The
461
 * duplicate entries are marked with repeated = 1.
462
 */
463
static void
464
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
465
0
{
466
0
  int     i;
467
0
  QueryItem  *item = GETQUERY(query);
468
0
  HeadlineWordEntry *word;
469
470
0
  while (prs->curwords + query->size >= prs->lenwords)
471
0
  {
472
0
    prs->lenwords *= 2;
473
0
    prs->words = (HeadlineWordEntry *) repalloc(prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
474
0
  }
475
476
0
  word = &(prs->words[prs->curwords - 1]);
477
0
  word->pos = LIMITPOS(pos);
478
0
  for (i = 0; i < query->size; i++)
479
0
  {
480
0
    if (item->type == QI_VAL &&
481
0
      tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
482
0
              buf, buflen, item->qoperand.prefix) == 0)
483
0
    {
484
0
      if (word->item)
485
0
      {
486
0
        memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
487
0
        prs->words[prs->curwords].item = &item->qoperand;
488
0
        prs->words[prs->curwords].repeated = 1;
489
0
        prs->curwords++;
490
0
      }
491
0
      else
492
0
        word->item = &item->qoperand;
493
0
    }
494
0
    item++;
495
0
  }
496
0
}
497
498
static void
499
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
500
0
{
501
0
  ParsedLex  *tmplexs;
502
0
  TSLexeme   *ptr;
503
0
  int32   savedpos;
504
505
0
  while (lexs)
506
0
  {
507
0
    if (lexs->type > 0)
508
0
      hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
509
510
0
    ptr = norms;
511
0
    savedpos = prs->vectorpos;
512
0
    while (ptr && ptr->lexeme)
513
0
    {
514
0
      if (ptr->flags & TSL_ADDPOS)
515
0
        savedpos++;
516
0
      hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
517
0
      ptr++;
518
0
    }
519
520
0
    tmplexs = lexs->next;
521
0
    pfree(lexs);
522
0
    lexs = tmplexs;
523
0
  }
524
525
0
  if (norms)
526
0
  {
527
0
    ptr = norms;
528
0
    while (ptr->lexeme)
529
0
    {
530
0
      if (ptr->flags & TSL_ADDPOS)
531
0
        prs->vectorpos++;
532
0
      pfree(ptr->lexeme);
533
0
      ptr++;
534
0
    }
535
0
    pfree(norms);
536
0
  }
537
0
}
538
539
void
540
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
541
0
{
542
0
  int     type,
543
0
        lenlemm = 0;  /* silence compiler warning */
544
0
  char     *lemm = NULL;
545
0
  LexizeData  ldata;
546
0
  TSLexeme   *norms;
547
0
  ParsedLex  *lexs;
548
0
  TSConfigCacheEntry *cfg;
549
0
  TSParserCacheEntry *prsobj;
550
0
  void     *prsdata;
551
552
0
  cfg = lookup_ts_config_cache(cfgId);
553
0
  prsobj = lookup_ts_parser_cache(cfg->prsId);
554
555
0
  prsdata = DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
556
0
                      PointerGetDatum(buf),
557
0
                      Int32GetDatum(buflen)));
558
559
0
  LexizeInit(&ldata, cfg);
560
561
0
  do
562
0
  {
563
0
    type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
564
0
                       PointerGetDatum(prsdata),
565
0
                       PointerGetDatum(&lemm),
566
0
                       PointerGetDatum(&lenlemm)));
567
568
0
    if (type > 0 && lenlemm >= MAXSTRLEN)
569
0
    {
570
0
#ifdef IGNORE_LONGLEXEME
571
0
      ereport(NOTICE,
572
0
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
573
0
           errmsg("word is too long to be indexed"),
574
0
           errdetail("Words longer than %d characters are ignored.",
575
0
                 MAXSTRLEN)));
576
0
      continue;
577
#else
578
      ereport(ERROR,
579
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
580
           errmsg("word is too long to be indexed"),
581
           errdetail("Words longer than %d characters are ignored.",
582
                 MAXSTRLEN)));
583
#endif
584
0
    }
585
586
0
    LexizeAddLemm(&ldata, type, lemm, lenlemm);
587
588
0
    do
589
0
    {
590
0
      if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
591
0
      {
592
0
        prs->vectorpos++;
593
0
        addHLParsedLex(prs, query, lexs, norms);
594
0
      }
595
0
      else
596
0
        addHLParsedLex(prs, query, lexs, NULL);
597
0
    } while (norms);
598
0
  } while (type > 0);
599
600
0
  FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
601
0
}
602
603
/*
604
 * Generate the headline, as a text object, from HeadlineParsedText.
605
 */
606
text *
607
generateHeadline(HeadlineParsedText *prs)
608
0
{
609
0
  text     *out;
610
0
  char     *ptr;
611
0
  int     len = 128;
612
0
  int     numfragments = 0;
613
0
  int16   infrag = 0;
614
615
0
  HeadlineWordEntry *wrd = prs->words;
616
617
0
  out = (text *) palloc(len);
618
0
  ptr = ((char *) out) + VARHDRSZ;
619
620
0
  while (wrd - prs->words < prs->curwords)
621
0
  {
622
0
    while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
623
0
    {
624
0
      int     dist = ptr - ((char *) out);
625
626
0
      len *= 2;
627
0
      out = (text *) repalloc(out, len);
628
0
      ptr = ((char *) out) + dist;
629
0
    }
630
631
0
    if (wrd->in && !wrd->repeated)
632
0
    {
633
0
      if (!infrag)
634
0
      {
635
636
        /* start of a new fragment */
637
0
        infrag = 1;
638
0
        numfragments++;
639
        /* add a fragment delimiter if this is after the first one */
640
0
        if (numfragments > 1)
641
0
        {
642
0
          memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
643
0
          ptr += prs->fragdelimlen;
644
0
        }
645
0
      }
646
0
      if (wrd->replace)
647
0
      {
648
0
        *ptr = ' ';
649
0
        ptr++;
650
0
      }
651
0
      else if (!wrd->skip)
652
0
      {
653
0
        if (wrd->selected)
654
0
        {
655
0
          memcpy(ptr, prs->startsel, prs->startsellen);
656
0
          ptr += prs->startsellen;
657
0
        }
658
0
        memcpy(ptr, wrd->word, wrd->len);
659
0
        ptr += wrd->len;
660
0
        if (wrd->selected)
661
0
        {
662
0
          memcpy(ptr, prs->stopsel, prs->stopsellen);
663
0
          ptr += prs->stopsellen;
664
0
        }
665
0
      }
666
0
    }
667
0
    else if (!wrd->repeated)
668
0
    {
669
0
      if (infrag)
670
0
        infrag = 0;
671
0
      pfree(wrd->word);
672
0
    }
673
674
0
    wrd++;
675
0
  }
676
677
0
  SET_VARSIZE(out, ptr - ((char *) out));
678
0
  return out;
679
0
}