/src/postgres/src/backend/tsearch/dict_thesaurus.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 *
 * dict_thesaurus.c
 *    Thesaurus dictionary: phrase to phrase substitution
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/backend/tsearch/dict_thesaurus.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/fmgrprotos.h"
#include "utils/regproc.h"


/*
 * Private flag bit kept temporarily in TSLexeme.flags while a thesaurus file
 * is being loaded.  addWrd() sets it on substitute words that must be used
 * as-is, and compileTheSubstitute() tests it to skip lexizing such a word and
 * then clears it in the compiled result.
 */
#define DT_USEASIS    0x1000

/*
 * One occurrence of a sample-phrase lexeme inside a thesaurus rule.
 *
 * Two independent singly linked lists run through these nodes: nextentry
 * chains all occurrences that share the same lexeme text (the list hanging
 * off TheLexeme.entries), while nextvariant chains the candidate rules that
 * findVariant() collects while a phrase is being matched.
 */
typedef struct LexemeInfo
{
  uint32    idsubst;    /* entry's number in DictThesaurus->subst */
  uint16    posinsubst;   /* zero-based position within the sample phrase */
  uint16    tnvariant;    /* total num lexemes in one variant */
  struct LexemeInfo *nextentry; /* next occurrence of the same lexeme */
  struct LexemeInfo *nextvariant; /* next candidate in a match list */
} LexemeInfo;

/*
 * Element of the DictThesaurus.wrds lookup array: a lexeme string and the
 * list of places where it occurs in sample phrases.  After compilation a
 * NULL lexeme stands for the stop-word placeholder ("?" in the file).
 */
typedef struct
{
  char     *lexeme;   /* lexeme text, or NULL for a stop word */
  LexemeInfo *entries;  /* occurrences, linked through nextentry */
} TheLexeme;

/*
 * Right-hand side of one thesaurus rule.
 */
typedef struct
{
  uint16    lastlexeme;   /* zero-based position of the last lexeme of
                 * the sample phrase (addWrd() stores
                 * posinsubst - 1); checkMatch() compares it
                 * with the current position */
  uint16    reslen;     /* number of lexemes in res, as counted by
                 * compileTheSubstitute() */
  TSLexeme   *res;      /* prepared substituted result */
} TheSubstitute;

/*
 * Complete in-memory state of one thesaurus dictionary, as returned by
 * thesaurus_init() and received back by thesaurus_lexize().
 */
typedef struct
{
  /* subdictionary to normalize lexemes */
  Oid     subdictOid;
  TSDictionaryCacheEntry *subdict;

  /*
   * Array to search lexeme by exact match.  It is filled with raw words by
   * newLexeme() and later replaced by a sorted, de-duplicated array in
   * compileTheLexeme(), which is what findTheLexeme() searches.
   */
  TheLexeme  *wrds;
  int     nwrds;      /* current number of words */
  int     ntwrds;     /* allocated array length */

  /*
   * Storage of substituted result, n-th element is for n-th expression.
   *
   * While the file is being read nsubst holds the allocated length of subst
   * (see addWrd()); thesaurusRead() finally sets it to the number of rules.
   */
  TheSubstitute *subst;
  int     nsubst;
} DictThesaurus;


/*
 * Append one raw sample-phrase word to d->wrds.
 *
 * The word is the byte range [b, e) and is copied into a freshly allocated
 * NUL-terminated string.  A single LexemeInfo is attached that records the
 * rule number (idsubst) and the word's position in the sample phrase
 * (posinsubst).  The array starts with 16 slots and doubles whenever it is
 * full.
 */
static void
newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
{
  size_t    len = e - b;
  TheLexeme  *slot;
  LexemeInfo *entry;

  /* make sure there is room for one more element */
  if (d->nwrds >= d->ntwrds)
  {
    if (d->ntwrds != 0)
    {
      d->ntwrds *= 2;
      d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
    }
    else
    {
      d->ntwrds = 16;
      d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
    }
  }

  slot = &d->wrds[d->nwrds];
  d->nwrds++;

  /* private, terminated copy of the word */
  slot->lexeme = palloc(len + 1);
  memcpy(slot->lexeme, b, len);
  slot->lexeme[len] = '\0';

  /* remember where in the rule set this word was seen */
  entry = (LexemeInfo *) palloc(sizeof(LexemeInfo));
  entry->idsubst = idsubst;
  entry->posinsubst = posinsubst;
  entry->nextentry = NULL;
  slot->entries = entry;
}

/*
 * Append one substitute word (byte range [b, e)) to rule idsubst.
 *
 * nwrd is the ordinal of the word within the substitute phrase; a value of 0
 * signals the start of a new rule, which resets the function-local fill
 * counters and, if necessary, grows d->subst.  posinsubst is the number of
 * sample-phrase words read for this rule; it is stored (minus one) as the
 * rule's lastlexeme.  useasis marks the word with DT_USEASIS.  The result
 * array is always left terminated by an element whose lexeme is NULL.
 */
static void
addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
{
  /* fill state of the result array of the rule currently being read */
  static int  nres = 0;
  static int  ntres = 0;
  size_t    len = e - b;
  TheSubstitute *rule;
  TSLexeme   *word;

  if (nwrd == 0)
  {
    /* first word of a rule: begin a fresh result array */
    nres = 0;
    ntres = 0;

    if (idsubst >= d->nsubst)
    {
      if (d->nsubst != 0)
      {
        d->nsubst *= 2;
        d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
      }
      else
      {
        d->nsubst = 16;
        d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
      }
    }
  }

  rule = &d->subst[idsubst];
  rule->lastlexeme = posinsubst - 1;

  /* need space for this word plus the terminating element */
  if (nres + 1 >= ntres)
  {
    if (ntres != 0)
    {
      ntres *= 2;
      rule->res = (TSLexeme *) repalloc(rule->res, sizeof(TSLexeme) * ntres);
    }
    else
    {
      ntres = 2;
      rule->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
    }
  }

  word = &rule->res[nres];
  word->lexeme = palloc(len + 1);
  memcpy(word->lexeme, b, len);
  word->lexeme[len] = '\0';
  word->nvariant = nwrd;
  word->flags = useasis ? DT_USEASIS : 0;

  /* keep the array terminated */
  nres++;
  rule->res[nres].lexeme = NULL;
}

/* Scanner states used by thesaurusRead() while parsing one line. */
enum
{
  TR_WAITLEX = 1,       /* before/between sample words */
  TR_INLEX = 2,         /* inside a sample word */
  TR_WAITSUBS = 3,      /* after ':', before/between substitute words */
  TR_INSUBS = 4         /* inside a substitute word */
};

/*
 * Read a thesaurus file and load its raw contents into *d.
 *
 * Each non-blank, non-comment ('#') line is one rule of the form
 *
 *    sample words : substitute words
 *
 * Sample words are stored with newLexeme(), substitute words with addWrd().
 * In the substitute part a word starting with '*' is flagged "use as is"; a
 * leading backslash is dropped and the word starts at the character that
 * follows it, so that character is not interpreted specially.
 *
 * Rules are numbered from 0 in file order; on return d->nsubst is the number
 * of rules read.  Any syntax problem is reported with ereport(ERROR).
 */
static void
thesaurusRead(const char *filename, DictThesaurus *d)
{
  tsearch_readline_state trst;
  uint32    idsubst = 0;
  bool    useasis = false;
  char     *line;

  /* NOTE(review): presumably maps the name to a "*.ths" path -- confirm */
  filename = get_tsearch_config_filename(filename, "ths");
  if (!tsearch_readline_begin(&trst, filename))
    ereport(ERROR,
        (errcode(ERRCODE_CONFIG_FILE_ERROR),
         errmsg("could not open thesaurus file \"%s\": %m",
            filename)));

  while ((line = tsearch_readline(&trst)) != NULL)
  {
    char     *ptr;
    int     state = TR_WAITLEX;
    char     *beginwrd = NULL;
    uint32    posinsubst = 0; /* sample words seen on this line */
    uint32    nwrd = 0;   /* substitute words seen on this line */

    ptr = line;

    /* skip leading whitespace, then ignore comment and empty lines */
    while (*ptr && isspace((unsigned char) *ptr))
      ptr += pg_mblen(ptr);

    if (t_iseq(ptr, '#') || *ptr == '\0' ||
      t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
    {
      pfree(line);
      continue;
    }

    /* scan the line one (possibly multibyte) character at a time */
    while (*ptr)
    {
      if (state == TR_WAITLEX)
      {
        if (t_iseq(ptr, ':'))
        {
          /* a delimiter with no sample word before it is an error */
          if (posinsubst == 0)
            ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("unexpected delimiter")));
          state = TR_WAITSUBS;
        }
        else if (!isspace((unsigned char) *ptr))
        {
          beginwrd = ptr;
          state = TR_INLEX;
        }
      }
      else if (state == TR_INLEX)
      {
        /* a sample word ends at ':' or at whitespace */
        if (t_iseq(ptr, ':'))
        {
          newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
          state = TR_WAITSUBS;
        }
        else if (isspace((unsigned char) *ptr))
        {
          newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
          state = TR_WAITLEX;
        }
      }
      else if (state == TR_WAITSUBS)
      {
        if (t_iseq(ptr, '*'))
        {
          /* '*' prefix: the word itself starts after the marker */
          useasis = true;
          state = TR_INSUBS;
          beginwrd = ptr + pg_mblen(ptr);
        }
        else if (t_iseq(ptr, '\\'))
        {
          /* backslash prefix: drop it, word starts at the next char */
          useasis = false;
          state = TR_INSUBS;
          beginwrd = ptr + pg_mblen(ptr);
        }
        else if (!isspace((unsigned char) *ptr))
        {
          useasis = false;
          beginwrd = ptr;
          state = TR_INSUBS;
        }
      }
      else if (state == TR_INSUBS)
      {
        if (isspace((unsigned char) *ptr))
        {
          /* empty word: a prefix character directly followed by space */
          if (ptr == beginwrd)
            ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("unexpected end of line or lexeme")));
          addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
          state = TR_WAITSUBS;
        }
      }
      else
        elog(ERROR, "unrecognized thesaurus state: %d", state);

      ptr += pg_mblen(ptr);
    }

    /* flush a substitute word that runs up to the end of the line */
    if (state == TR_INSUBS)
    {
      if (ptr == beginwrd)
        ereport(ERROR,
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
             errmsg("unexpected end of line or lexeme")));
      addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
    }

    idsubst++;

    /* a rule needs at least one sample word and one substitute word */
    if (!(nwrd && posinsubst))
      ereport(ERROR,
          (errcode(ERRCODE_CONFIG_FILE_ERROR),
           errmsg("unexpected end of line")));

    /* both counters are passed around and stored as uint16 */
    if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
      ereport(ERROR,
          (errcode(ERRCODE_CONFIG_FILE_ERROR),
           errmsg("too many lexemes in thesaurus entry")));

    pfree(line);
  }

  /* from here on nsubst is the rule count, not the allocated length */
  d->nsubst = idsubst;

  tsearch_readline_end(&trst);
}

/*
 * Append one compiled lexeme to the array newwrds and return the (possibly
 * reallocated) array.
 *
 * *nnw is the number of used elements and *tnm the allocated length; both
 * are updated.  When lexeme is NULL or carries no text, a stop-word element
 * (NULL lexeme, tnvariant 1) is stored; otherwise the text is duplicated and
 * tnvariant is recorded as given.  The rule number and position are taken
 * from src.
 */
static TheLexeme *
addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
{
  bool    hastext = (lexeme != NULL && lexeme->lexeme != NULL);
  TheLexeme  *slot;
  LexemeInfo *entry;

  /* double the array when it is full */
  if (*nnw >= *tnm)
  {
    *tnm *= 2;
    newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
  }

  slot = &newwrds[*nnw];
  entry = (LexemeInfo *) palloc(sizeof(LexemeInfo));

  slot->lexeme = hastext ? pstrdup(lexeme->lexeme) : NULL;

  entry->tnvariant = hastext ? tnvariant : 1;
  entry->idsubst = src->idsubst;
  entry->posinsubst = src->posinsubst;
  entry->nextentry = NULL;
  slot->entries = entry;

  (*nnw)++;
  return newwrds;
}

/*
 * Three-way comparison of two LexemeInfo nodes by idsubst, then posinsubst,
 * then tnvariant.  If either pointer is NULL the nodes compare as equal.
 */
static int
cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
{
  if (a == NULL || b == NULL)
    return 0;

  if (a->idsubst != b->idsubst)
    return (a->idsubst > b->idsubst) ? 1 : -1;

  if (a->posinsubst != b->posinsubst)
    return (a->posinsubst > b->posinsubst) ? 1 : -1;

  if (a->tnvariant != b->tnvariant)
    return (a->tnvariant > b->tnvariant) ? 1 : -1;

  return 0;
}

/*
 * Compare two TheLexeme elements by lexeme text using strcmp().  An element
 * with a NULL lexeme sorts after every element that has text; two NULL
 * lexemes are equal.
 */
static int
cmpLexeme(const TheLexeme *a, const TheLexeme *b)
{
  if (a->lexeme != NULL && b->lexeme != NULL)
    return strcmp(a->lexeme, b->lexeme);

  if (a->lexeme == NULL && b->lexeme == NULL)
    return 0;

  /* exactly one side is NULL, and NULL goes last */
  return (a->lexeme == NULL) ? 1 : -1;
}

/*
 * Adapter giving cmpLexeme() the (const void *, const void *) signature that
 * bsearch() expects; compares by lexeme text only.
 */
static int
cmpLexemeQ(const void *a, const void *b)
{
  const TheLexeme *lhs = a;
  const TheLexeme *rhs = b;

  return cmpLexeme(lhs, rhs);
}

/*
 * qsort() comparator for the compiled word array: primary key is the lexeme
 * text (cmpLexeme), ties are broken by the first LexemeInfo of each element
 * in reverse cmpLexemeInfo() order.
 */
static int
cmpTheLexeme(const void *a, const void *b)
{
  const TheLexeme *lhs = (const TheLexeme *) a;
  const TheLexeme *rhs = (const TheLexeme *) b;
  int     bytext = cmpLexeme(lhs, rhs);

  if (bytext == 0)
    return -cmpLexemeInfo(lhs->entries, rhs->entries);

  return bytext;
}

/*
 * Turn the raw sample words collected by thesaurusRead() into the sorted
 * lookup array used at lexize time.
 *
 * Every raw word is normalized through the subdictionary (the literal "?"
 * instead becomes a stop-word element with a NULL lexeme).  The resulting
 * elements replace d->wrds, are sorted with cmpTheLexeme(), and elements
 * with equal lexeme text are merged into one element whose entries list
 * (linked through nextentry) holds all distinct occurrences.
 *
 * Raises an error if the subdictionary does not recognize a sample word or
 * reports it as a stop word.
 */
static void
compileTheLexeme(DictThesaurus *d)
{
  int     i,
        nnw = 0,
        tnm = 16;
  TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
         *ptrwrds;

  for (i = 0; i < d->nwrds; i++)
  {
    TSLexeme   *ptr;

    if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
      newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
    else
    {
      /* normalize the raw word with the subdictionary's lexize function */
      ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
                               PointerGetDatum(d->subdict->dictData),
                               PointerGetDatum(d->wrds[i].lexeme),
                               Int32GetDatum(strlen(d->wrds[i].lexeme)),
                               PointerGetDatum(NULL)));

      if (!ptr)
        ereport(ERROR,
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
             errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
                d->wrds[i].lexeme,
                d->wrds[i].entries->idsubst + 1)));
      else if (!(ptr->lexeme))
        ereport(ERROR,
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
             errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
                d->wrds[i].lexeme,
                d->wrds[i].entries->idsubst + 1),
             errhint("Use \"?\" to represent a stop word within a sample phrase.")));
      else
      {
        /* walk the result one variant (run of equal nvariant) at a time */
        while (ptr->lexeme)
        {
          TSLexeme   *remptr = ptr + 1;
          int     tnvar = 1;
          int     curvar = ptr->nvariant;

          /* compute n words in one variant */
          while (remptr->lexeme)
          {
            if (remptr->nvariant != (remptr - 1)->nvariant)
              break;
            tnvar++;
            remptr++;
          }

          /* store each lexeme of the variant, tagged with its size */
          remptr = ptr;
          while (remptr->lexeme && remptr->nvariant == curvar)
          {
            newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
            remptr++;
          }

          ptr = remptr;
        }
      }
    }

    /* the raw word is no longer needed */
    pfree(d->wrds[i].lexeme);
    pfree(d->wrds[i].entries);
  }

  /* swap in the compiled array */
  if (d->wrds)
    pfree(d->wrds);
  d->wrds = newwrds;
  d->nwrds = nnw;
  d->ntwrds = tnm;

  if (d->nwrds > 1)
  {
    qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);

    /*
     * uniq: newwrds is the last kept element, ptrwrds the one being
     * examined.  Because equal lexemes were sorted in reverse
     * cmpLexemeInfo() order and each new occurrence is pushed onto the
     * front of the kept element's list, the finished list is in ascending
     * order.
     */
    newwrds = d->wrds;
    ptrwrds = d->wrds + 1;
    while (ptrwrds - d->wrds < d->nwrds)
    {
      if (cmpLexeme(ptrwrds, newwrds) == 0)
      {
        if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
        {
          /* same text, different occurrence: prepend to the list */
          ptrwrds->entries->nextentry = newwrds->entries;
          newwrds->entries = ptrwrds->entries;
        }
        else
          pfree(ptrwrds->entries);  /* exact duplicate */

        if (ptrwrds->lexeme)
          pfree(ptrwrds->lexeme);
      }
      else
      {
        /* new lexeme text: keep it in the next output slot */
        newwrds++;
        *newwrds = *ptrwrds;
      }

      ptrwrds++;
    }

    /* shrink the array to the number of distinct lexemes */
    d->nwrds = newwrds - d->wrds + 1;
    d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
  }
}

/*
 * Build the final substitute lexeme array of every rule.
 *
 * For each rule the raw word list produced by addWrd() is replaced by a new
 * array: words flagged DT_USEASIS are copied unchanged (with the flag
 * cleared), all others are normalized through the subdictionary.  The first
 * output lexeme of every source word except the first gets TSL_ADDPOS set.
 * The raw list and its strings are freed and reslen is set to the number of
 * output lexemes.
 *
 * Raises an error if a substitute word is a stop word or is not recognized
 * by the subdictionary, or if a rule ends up with no output lexemes.
 */
static void
compileTheSubstitute(DictThesaurus *d)
{
  int     i;

  for (i = 0; i < d->nsubst; i++)
  {
    TSLexeme   *rem = d->subst[i].res,  /* raw list, freed at the end */
           *outptr,
           *inptr;
    int     n = 2;    /* allocated length of the new array */

    outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
    outptr->lexeme = NULL;
    inptr = rem;

    while (inptr && inptr->lexeme)
    {
      TSLexeme   *lexized,
            tmplex[2];

      if (inptr->flags & DT_USEASIS)
      {         /* do not lexize */
        tmplex[0] = *inptr;
        tmplex[0].flags = 0;
        tmplex[1].lexeme = NULL;
        lexized = tmplex;
      }
      else
      {
        lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
                                   PointerGetDatum(d->subdict->dictData),
                                   PointerGetDatum(inptr->lexeme),
                                   Int32GetDatum(strlen(inptr->lexeme)),
                                   PointerGetDatum(NULL)));
      }

      if (lexized && lexized->lexeme)
      {
        /*
         * Index of the first lexeme this word will produce, or -1 when
         * this is the first word of the phrase.  An index is remembered
         * rather than a pointer because the array may be moved by
         * repalloc below.
         */
        int     toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;

        while (lexized->lexeme)
        {
          /* grow so that one spare element always remains */
          if (outptr - d->subst[i].res + 1 >= n)
          {
            int     diff = outptr - d->subst[i].res;

            n *= 2;
            d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
            outptr = d->subst[i].res + diff;
          }

          /* copy the element and give it its own string */
          *outptr = *lexized;
          outptr->lexeme = pstrdup(lexized->lexeme);

          outptr++;
          lexized++;
        }

        /* NOTE(review): TSL_ADDPOS presumably bumps the position -- confirm */
        if (toset > 0)
          d->subst[i].res[toset].flags |= TSL_ADDPOS;
      }
      else if (lexized)
      {
        ereport(ERROR,
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
             errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
                inptr->lexeme, i + 1)));
      }
      else
      {
        ereport(ERROR,
            (errcode(ERRCODE_CONFIG_FILE_ERROR),
             errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
                inptr->lexeme, i + 1)));
      }

      if (inptr->lexeme)
        pfree(inptr->lexeme);
      inptr++;
    }

    if (outptr == d->subst[i].res)
      ereport(ERROR,
          (errcode(ERRCODE_CONFIG_FILE_ERROR),
           errmsg("thesaurus substitute phrase is empty (rule %d)",
              i + 1)));

    /*
     * No terminating element is written after the last lexeme; the length
     * is carried in reslen, which is what copyTSLexeme() uses.
     */
    d->subst[i].reslen = outptr - d->subst[i].res;

    pfree(rem);
  }
}

/*
 * thesaurus_init
 *
 * Dictionary init method.  Argument 0 is the option list; two options are
 * accepted, each at most once:
 *
 *    dictfile    thesaurus file, loaded immediately with thesaurusRead()
 *    dictionary  name of the subdictionary used to normalize words
 *
 * Both are required.  After the options are processed the subdictionary is
 * looked up and the loaded rules are compiled.  Returns a pointer to the new
 * DictThesaurus.
 */
Datum
thesaurus_init(PG_FUNCTION_ARGS)
{
  List     *dictoptions = (List *) PG_GETARG_POINTER(0);
  DictThesaurus *dict = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
  char     *subdictname = NULL;
  bool    fileloaded = false;
  ListCell   *cell;

  foreach(cell, dictoptions)
  {
    DefElem    *opt = (DefElem *) lfirst(cell);
    bool    isfile = (strcmp(opt->defname, "dictfile") == 0);
    bool    issubdict = !isfile && (strcmp(opt->defname, "dictionary") == 0);

    if (isfile)
    {
      if (fileloaded)
        ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
             errmsg("multiple DictFile parameters")));
      thesaurusRead(defGetString(opt), dict);
      fileloaded = true;
    }
    else if (issubdict)
    {
      if (subdictname != NULL)
        ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
             errmsg("multiple Dictionary parameters")));
      subdictname = pstrdup(defGetString(opt));
    }
    else
    {
      ereport(ERROR,
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
           errmsg("unrecognized Thesaurus parameter: \"%s\"",
              opt->defname)));
    }
  }

  /* both options are mandatory */
  if (!fileloaded)
    ereport(ERROR,
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
         errmsg("missing DictFile parameter")));
  if (subdictname == NULL)
    ereport(ERROR,
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
         errmsg("missing Dictionary parameter")));

  /* resolve the subdictionary, then compile what was read from the file */
  dict->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname, NULL),
                     false);
  dict->subdict = lookup_ts_dictionary_cache(dict->subdictOid);

  compileTheLexeme(dict);
  compileTheSubstitute(dict);

  PG_RETURN_POINTER(dict);
}

/*
 * Look up a lexeme in the compiled word array by binary search and return
 * the head of its occurrence list, or NULL if the array is empty or the
 * lexeme is not present.  A NULL lexeme argument looks up the stop-word
 * element.
 */
static LexemeInfo *
findTheLexeme(DictThesaurus *d, char *lexeme)
{
  TheLexeme probe;
  TheLexeme  *hit;

  if (d->nwrds == 0)
    return NULL;

  probe.lexeme = lexeme;
  probe.entries = NULL;

  hit = bsearch(&probe, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);

  return (hit != NULL) ? hit->entries : NULL;
}

/*
 * Report whether rule number idsubst occurs in the nextvariant-linked list
 * starting at stored.  An empty (NULL) list matches every rule.
 */
static bool
matchIdSubst(LexemeInfo *stored, uint32 idsubst)
{
  LexemeInfo *cur;

  /* no list means no restriction */
  if (stored == NULL)
    return true;

  for (cur = stored; cur != NULL; cur = cur->nextvariant)
  {
    if (cur->idsubst == idsubst)
      return true;
  }

  return false;
}

/*
 * Extend the candidate list "in" with every rule that the current variant
 * can belong to.
 *
 *  in      candidates found so far for this word (linked by nextvariant)
 *  stored  candidates left over from the previous word, or NULL
 *  curpos  position in the sample phrase the current word must occupy
 *  newin   array of newn occurrence lists (linked by nextentry), one per
 *          lexeme of the variant; the array elements are advanced in place
 *  newn    number of lexemes in the variant
 *
 * A rule qualifies when every list contains an occurrence with that rule's
 * idsubst, with posinsubst equal to curpos and tnvariant equal to newn, the
 * rule is acceptable according to "stored", and it is not already in "in".
 * The search ends, returning the (possibly extended) list, as soon as any of
 * the occurrence lists is exhausted.
 */
static LexemeInfo *
findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
{
  for (;;)
  {
    int     i;
    LexemeInfo *ptr = newin[0]; /* occurrence carrying the target idsubst */

    for (i = 0; i < newn; i++)
    {
      /* skip occurrences belonging to rules below the target */
      while (newin[i] && newin[i]->idsubst < ptr->idsubst)
        newin[i] = newin[i]->nextentry;

      if (newin[i] == NULL)
        return in;

      /* this list is already past the target: retarget and restart */
      if (newin[i]->idsubst > ptr->idsubst)
      {
        ptr = newin[i];
        i = -1;
        continue;
      }

      /* within the target rule, look for the right position and size */
      while (newin[i]->idsubst == ptr->idsubst)
      {
        if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
        {
          ptr = newin[i];
          break;
        }

        newin[i] = newin[i]->nextentry;
        if (newin[i] == NULL)
          return in;
      }

      /* ran out of the target rule without a hit: retarget and restart */
      if (newin[i]->idsubst != ptr->idsubst)
      {
        ptr = newin[i];
        i = -1;
        continue;
      }
    }

    /*
     * matchIdSubst() returns true for an empty list, hence the explicit
     * "in == NULL" test in the "not already present" condition.
     */
    if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
    {           /* found */

      ptr->nextvariant = in;
      in = ptr;
    }

    /* step forward */
    for (i = 0; i < newn; i++)
      newin[i] = newin[i]->nextentry;
  }
}

/*
 * Return a freshly allocated copy of a rule's substitute array: reslen
 * elements, each with its own duplicated lexeme string, followed by a
 * terminating element whose lexeme is NULL.
 */
static TSLexeme *
copyTSLexeme(TheSubstitute *ts)
{
  uint16    count = ts->reslen;
  TSLexeme   *copy = (TSLexeme *) palloc(sizeof(TSLexeme) * (count + 1));
  uint16    k;

  for (k = 0; k < count; k++)
  {
    TSLexeme   *dst = &copy[k];

    *dst = ts->res[k];
    dst->lexeme = pstrdup(ts->res[k].lexeme);
  }

  copy[count].lexeme = NULL;

  return copy;
}

/*
 * Walk the candidate list "info" (linked by nextvariant) and return a copy
 * of the substitute of the first rule whose sample phrase ends at curpos, or
 * NULL if there is none.
 *
 * *moreres is set to true if any candidate visited, up to and including the
 * one returned, has a successor in the list; otherwise it is false.
 */
static TSLexeme *
checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
{
  LexemeInfo *cand;

  *moreres = false;

  for (cand = info; cand != NULL; cand = cand->nextvariant)
  {
    TheSubstitute *rule;

    Assert(cand->idsubst < d->nsubst);

    if (cand->nextvariant != NULL)
      *moreres = true;

    rule = &d->subst[cand->idsubst];
    if (rule->lastlexeme == curpos)
      return copyTSLexeme(rule);
  }

  return NULL;
}

/*
 * thesaurus_lexize
 *
 * Dictionary lexize method.  Arguments: 0 = DictThesaurus, 1 and 2 = the
 * word and its length (passed through unchanged to the subdictionary),
 * 3 = DictSubState carrying the match state between consecutive words.
 *
 * The word is normalized by the subdictionary, and the rules it can belong
 * to at the current phrase position are collected (restricted to the rules
 * still alive from the previous word, kept in dstate->private_state).  If
 * one of them is completed by this word, a copy of its substitute is
 * returned and dstate->getnext tells whether other candidates remain.  If
 * candidates exist but none is complete, NULL is returned with getnext set
 * to true.  If there are no candidates, NULL is returned with getnext set to
 * false.
 *
 * Calling with anything other than four arguments, or with a NULL state, is
 * an error.
 */
Datum
thesaurus_lexize(PG_FUNCTION_ARGS)
{
  DictThesaurus *d;
  DictSubState *dstate;
  TSLexeme   *res = NULL;
  LexemeInfo *stored,
         *info = NULL;
  uint16    curpos = 0;
  bool    moreres = false;

  /*
   * Check the argument count before reading any argument slot: with fewer
   * than four arguments, slot 3 is not something the caller supplied.
   */
  if (PG_NARGS() != 4)
    elog(ERROR, "forbidden call of thesaurus or nested call");

  d = (DictThesaurus *) PG_GETARG_POINTER(0);
  dstate = (DictSubState *) PG_GETARG_POINTER(3);

  if (dstate == NULL)
    elog(ERROR, "forbidden call of thesaurus or nested call");

  if (dstate->isend)
    PG_RETURN_POINTER(NULL);
  stored = (LexemeInfo *) dstate->private_state;

  /* continue the phrase right after the previously matched position */
  if (stored)
    curpos = stored->posinsubst + 1;

  /* refresh the cached subdictionary entry if it has been invalidated */
  if (!d->subdict->isvalid)
    d->subdict = lookup_ts_dictionary_cache(d->subdictOid);

  res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
                           PointerGetDatum(d->subdict->dictData),
                           PG_GETARG_DATUM(1),
                           PG_GETARG_DATUM(2),
                           PointerGetDatum(NULL)));

  if (res && res->lexeme)
  {
    TSLexeme   *ptr = res,
           *basevar;

    /* handle the result one variant (run of equal nvariant) at a time */
    while (ptr->lexeme)
    {
      uint16    nv = ptr->nvariant;
      uint16    i,
            nlex = 0;
      LexemeInfo **infos;

      basevar = ptr;
      while (ptr->lexeme && nv == ptr->nvariant)
      {
        nlex++;
        ptr++;
      }

      /* every lexeme of the variant must be known to the thesaurus */
      infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
      for (i = 0; i < nlex; i++)
        if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
          break;

      if (i < nlex)
      {
        /* no chance to find */
        pfree(infos);
        continue;
      }

      info = findVariant(info, stored, curpos, infos, nlex);

      /* findVariant() does not keep the array, so release it here too */
      pfree(infos);
    }
  }
  else if (res)
  {             /* stop-word */
    LexemeInfo *infos = findTheLexeme(d, NULL);

    info = findVariant(NULL, stored, curpos, &infos, 1);
  }
  else
  {
    info = NULL;      /* word isn't recognized */
  }

  dstate->private_state = info;

  if (!info)
  {
    dstate->getnext = false;
    PG_RETURN_POINTER(NULL);
  }

  if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
  {
    dstate->getnext = moreres;
    PG_RETURN_POINTER(res);
  }

  dstate->getnext = true;

  PG_RETURN_POINTER(NULL);
}

Coverage Report

Created: 2025-07-03 06:49

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		*
3		* dict_thesaurus.c
4		* Thesaurus dictionary: phrase to phrase substitution
5		*
6		* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7		*
8		*
9		* IDENTIFICATION
10		* src/backend/tsearch/dict_thesaurus.c
11		*
12		*-------------------------------------------------------------------------
13		*/
14		#include "postgres.h"
15
16		#include "catalog/namespace.h"
17		#include "commands/defrem.h"
18		#include "tsearch/ts_cache.h"
19		#include "tsearch/ts_locale.h"
20		#include "tsearch/ts_public.h"
21		#include "utils/fmgrprotos.h"
22		#include "utils/regproc.h"
23
24
25		/*
26		* Temporary we use TSLexeme.flags for inner use...
27		*/
28	0	#define DT_USEASIS 0x1000
29
30		typedef struct LexemeInfo
31		{
32		uint32 idsubst; /* entry's number in DictThesaurus->subst */
33		uint16 posinsubst; /* pos info in entry */
34		uint16 tnvariant; /* total num lexemes in one variant */
35		struct LexemeInfo *nextentry;
36		struct LexemeInfo *nextvariant;
37		} LexemeInfo;
38
39		typedef struct
40		{
41		char *lexeme;
42		LexemeInfo *entries;
43		} TheLexeme;
44
45		typedef struct
46		{
47		uint16 lastlexeme; /* number lexemes to substitute */
48		uint16 reslen;
49		TSLexeme *res; /* prepared substituted result */
50		} TheSubstitute;
51
52		typedef struct
53		{
54		/* subdictionary to normalize lexemes */
55		Oid subdictOid;
56		TSDictionaryCacheEntry *subdict;
57
58		/* Array to search lexeme by exact match */
59		TheLexeme *wrds;
60		int nwrds; /* current number of words */
61		int ntwrds; /* allocated array length */
62
63		/*
64		* Storage of substituted result, n-th element is for n-th expression
65		*/
66		TheSubstitute *subst;
67		int nsubst;
68		} DictThesaurus;
69
70
71		static void
72		newLexeme(DictThesaurus d, char b, char *e, uint32 idsubst, uint16 posinsubst)
73	0	{
74	0	TheLexeme *ptr;
75
76	0	if (d->nwrds >= d->ntwrds)
77	0	{
78	0	if (d->ntwrds == 0)
79	0	{
80	0	d->ntwrds = 16;
81	0	d->wrds = (TheLexeme ) palloc(sizeof(TheLexeme) d->ntwrds);
82	0	}
83	0	else
84	0	{
85	0	d->ntwrds *= 2;
86	0	d->wrds = (TheLexeme ) repalloc(d->wrds, sizeof(TheLexeme) d->ntwrds);
87	0	}
88	0	}
89
90	0	ptr = d->wrds + d->nwrds;
91	0	d->nwrds++;
92
93	0	ptr->lexeme = palloc(e - b + 1);
94
95	0	memcpy(ptr->lexeme, b, e - b);
96	0	ptr->lexeme[e - b] = '\0';
97
98	0	ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
99
100	0	ptr->entries->nextentry = NULL;
101	0	ptr->entries->idsubst = idsubst;
102	0	ptr->entries->posinsubst = posinsubst;
103	0	}
104
105		static void
106		addWrd(DictThesaurus d, char b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
107	0	{
108	0	static int nres = 0;
109	0	static int ntres = 0;
110	0	TheSubstitute *ptr;
111
112	0	if (nwrd == 0)
113	0	{
114	0	nres = ntres = 0;
115
116	0	if (idsubst >= d->nsubst)
117	0	{
118	0	if (d->nsubst == 0)
119	0	{
120	0	d->nsubst = 16;
121	0	d->subst = (TheSubstitute ) palloc(sizeof(TheSubstitute) d->nsubst);
122	0	}
123	0	else
124	0	{
125	0	d->nsubst *= 2;
126	0	d->subst = (TheSubstitute ) repalloc(d->subst, sizeof(TheSubstitute) d->nsubst);
127	0	}
128	0	}
129	0	}
130
131	0	ptr = d->subst + idsubst;
132
133	0	ptr->lastlexeme = posinsubst - 1;
134
135	0	if (nres + 1 >= ntres)
136	0	{
137	0	if (ntres == 0)
138	0	{
139	0	ntres = 2;
140	0	ptr->res = (TSLexeme ) palloc(sizeof(TSLexeme) ntres);
141	0	}
142	0	else
143	0	{
144	0	ntres *= 2;
145	0	ptr->res = (TSLexeme ) repalloc(ptr->res, sizeof(TSLexeme) ntres);
146	0	}
147	0	}
148
149	0	ptr->res[nres].lexeme = palloc(e - b + 1);
150	0	memcpy(ptr->res[nres].lexeme, b, e - b);
151	0	ptr->res[nres].lexeme[e - b] = '\0';
152
153	0	ptr->res[nres].nvariant = nwrd;
154	0	if (useasis)
155	0	ptr->res[nres].flags = DT_USEASIS;
156	0	else
157	0	ptr->res[nres].flags = 0;
158
159	0	ptr->res[++nres].lexeme = NULL;
160	0	}
161
162	0	#define TR_WAITLEX 1
163	0	#define TR_INLEX 2
164	0	#define TR_WAITSUBS 3
165	0	#define TR_INSUBS 4
166
167		static void
168		thesaurusRead(const char filename, DictThesaurus d)
169	0	{
170	0	tsearch_readline_state trst;
171	0	uint32 idsubst = 0;
172	0	bool useasis = false;
173	0	char *line;
174
175	0	filename = get_tsearch_config_filename(filename, "ths");
176	0	if (!tsearch_readline_begin(&trst, filename))
177	0	ereport(ERROR,
178	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
179	0	errmsg("could not open thesaurus file \"%s\": %m",
180	0	filename)));
181
182	0	while ((line = tsearch_readline(&trst)) != NULL)
183	0	{
184	0	char *ptr;
185	0	int state = TR_WAITLEX;
186	0	char *beginwrd = NULL;
187	0	uint32 posinsubst = 0;
188	0	uint32 nwrd = 0;
189
190	0	ptr = line;
191
192		/* is it a comment? */
193	0	while (*ptr && isspace((unsigned char) *ptr))
194	0	ptr += pg_mblen(ptr);
195
196	0	if (t_iseq(ptr, '#') || *ptr == '\0' ||
197	0	t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
198	0	{
199	0	pfree(line);
200	0	continue;
201	0	}
202
203	0	while (*ptr)
204	0	{
205	0	if (state == TR_WAITLEX)
206	0	{
207	0	if (t_iseq(ptr, ':'))
208	0	{
209	0	if (posinsubst == 0)
210	0	ereport(ERROR,
211	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
212	0	errmsg("unexpected delimiter")));
213	0	state = TR_WAITSUBS;
214	0	}
215	0	else if (!isspace((unsigned char) *ptr))
216	0	{
217	0	beginwrd = ptr;
218	0	state = TR_INLEX;
219	0	}
220	0	}
221	0	else if (state == TR_INLEX)
222	0	{
223	0	if (t_iseq(ptr, ':'))
224	0	{
225	0	newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
226	0	state = TR_WAITSUBS;
227	0	}
228	0	else if (isspace((unsigned char) *ptr))
229	0	{
230	0	newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
231	0	state = TR_WAITLEX;
232	0	}
233	0	}
234	0	else if (state == TR_WAITSUBS)
235	0	{
236	0	if (t_iseq(ptr, '*'))
237	0	{
238	0	useasis = true;
239	0	state = TR_INSUBS;
240	0	beginwrd = ptr + pg_mblen(ptr);
241	0	}
242	0	else if (t_iseq(ptr, '\\'))
243	0	{
244	0	useasis = false;
245	0	state = TR_INSUBS;
246	0	beginwrd = ptr + pg_mblen(ptr);
247	0	}
248	0	else if (!isspace((unsigned char) *ptr))
249	0	{
250	0	useasis = false;
251	0	beginwrd = ptr;
252	0	state = TR_INSUBS;
253	0	}
254	0	}
255	0	else if (state == TR_INSUBS)
256	0	{
257	0	if (isspace((unsigned char) *ptr))
258	0	{
259	0	if (ptr == beginwrd)
260	0	ereport(ERROR,
261	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
262	0	errmsg("unexpected end of line or lexeme")));
263	0	addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
264	0	state = TR_WAITSUBS;
265	0	}
266	0	}
267	0	else
268	0	elog(ERROR, "unrecognized thesaurus state: %d", state);
269
270	0	ptr += pg_mblen(ptr);
271	0	}
272
273	0	if (state == TR_INSUBS)
274	0	{
275	0	if (ptr == beginwrd)
276	0	ereport(ERROR,
277	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
278	0	errmsg("unexpected end of line or lexeme")));
279	0	addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
280	0	}
281
282	0	idsubst++;
283
284	0	if (!(nwrd && posinsubst))
285	0	ereport(ERROR,
286	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
287	0	errmsg("unexpected end of line")));
288
289	0	if (nwrd != (uint16) nwrd \|\| posinsubst != (uint16) posinsubst)
290	0	ereport(ERROR,
291	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
292	0	errmsg("too many lexemes in thesaurus entry")));
293
294	0	pfree(line);
295	0	}
296
297	0	d->nsubst = idsubst;
298
299	0	tsearch_readline_end(&trst);
300	0	}
301
302		static TheLexeme *
303		addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
304	0	{
305	0	if (*nnw >= *tnm)
306	0	{
307	0	*tnm *= 2;
308	0	newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
309	0	}
310
311	0	newwrds[nnw].entries = (LexemeInfo ) palloc(sizeof(LexemeInfo));
312
313	0	if (lexeme && lexeme->lexeme)
314	0	{
315	0	newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
316	0	newwrds[*nnw].entries->tnvariant = tnvariant;
317	0	}
318	0	else
319	0	{
320	0	newwrds[*nnw].lexeme = NULL;
321	0	newwrds[*nnw].entries->tnvariant = 1;
322	0	}
323
324	0	newwrds[*nnw].entries->idsubst = src->idsubst;
325	0	newwrds[*nnw].entries->posinsubst = src->posinsubst;
326
327	0	newwrds[*nnw].entries->nextentry = NULL;
328
329	0	(*nnw)++;
330	0	return newwrds;
331	0	}
332
333		static int
334		cmpLexemeInfo(LexemeInfo a, LexemeInfo b)
335	0	{
336	0	if (a == NULL \|\| b == NULL)
337	0	return 0;
338
339	0	if (a->idsubst == b->idsubst)
340	0	{
341	0	if (a->posinsubst == b->posinsubst)
342	0	{
343	0	if (a->tnvariant == b->tnvariant)
344	0	return 0;
345
346	0	return (a->tnvariant > b->tnvariant) ? 1 : -1;
347	0	}
348
349	0	return (a->posinsubst > b->posinsubst) ? 1 : -1;
350	0	}
351
352	0	return (a->idsubst > b->idsubst) ? 1 : -1;
353	0	}
354
355		static int
356		cmpLexeme(const TheLexeme a, const TheLexeme b)
357	0	{
358	0	if (a->lexeme == NULL)
359	0	{
360	0	if (b->lexeme == NULL)
361	0	return 0;
362	0	else
363	0	return 1;
364	0	}
365	0	else if (b->lexeme == NULL)
366	0	return -1;
367
368	0	return strcmp(a->lexeme, b->lexeme);
369	0	}
370
371		static int
372		cmpLexemeQ(const void a, const void b)
373	0	{
374	0	return cmpLexeme((const TheLexeme ) a, (const TheLexeme ) b);
375	0	}
376
377		static int
378		cmpTheLexeme(const void a, const void b)
379	0	{
380	0	const TheLexeme la = (const TheLexeme ) a;
381	0	const TheLexeme lb = (const TheLexeme ) b;
382	0	int res;
383
384	0	if ((res = cmpLexeme(la, lb)) != 0)
385	0	return res;
386
387	0	return -cmpLexemeInfo(la->entries, lb->entries);
388	0	}
389
390		static void
391		compileTheLexeme(DictThesaurus *d)
392	0	{
393	0	int i,
394	0	nnw = 0,
395	0	tnm = 16;
396	0	TheLexeme newwrds = (TheLexeme ) palloc(sizeof(TheLexeme) * tnm),
397	0	*ptrwrds;
398
399	0	for (i = 0; i < d->nwrds; i++)
400	0	{
401	0	TSLexeme *ptr;
402
403	0	if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
404	0	newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
405	0	else
406	0	{
407	0	ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
408	0	PointerGetDatum(d->subdict->dictData),
409	0	PointerGetDatum(d->wrds[i].lexeme),
410	0	Int32GetDatum(strlen(d->wrds[i].lexeme)),
411	0	PointerGetDatum(NULL)));
412
413	0	if (!ptr)
414	0	ereport(ERROR,
415	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
416	0	errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
417	0	d->wrds[i].lexeme,
418	0	d->wrds[i].entries->idsubst + 1)));
419	0	else if (!(ptr->lexeme))
420	0	ereport(ERROR,
421	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
422	0	errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
423	0	d->wrds[i].lexeme,
424	0	d->wrds[i].entries->idsubst + 1),
425	0	errhint("Use \"?\" to represent a stop word within a sample phrase.")));
426	0	else
427	0	{
428	0	while (ptr->lexeme)
429	0	{
430	0	TSLexeme *remptr = ptr + 1;
431	0	int tnvar = 1;
432	0	int curvar = ptr->nvariant;
433
434		/* compute n words in one variant */
435	0	while (remptr->lexeme)
436	0	{
437	0	if (remptr->nvariant != (remptr - 1)->nvariant)
438	0	break;
439	0	tnvar++;
440	0	remptr++;
441	0	}
442
443	0	remptr = ptr;
444	0	while (remptr->lexeme && remptr->nvariant == curvar)
445	0	{
446	0	newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
447	0	remptr++;
448	0	}
449
450	0	ptr = remptr;
451	0	}
452	0	}
453	0	}
454
455	0	pfree(d->wrds[i].lexeme);
456	0	pfree(d->wrds[i].entries);
457	0	}
458
459	0	if (d->wrds)
460	0	pfree(d->wrds);
461	0	d->wrds = newwrds;
462	0	d->nwrds = nnw;
463	0	d->ntwrds = tnm;
464
465	0	if (d->nwrds > 1)
466	0	{
467	0	qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
468
469		/* uniq */
470	0	newwrds = d->wrds;
471	0	ptrwrds = d->wrds + 1;
472	0	while (ptrwrds - d->wrds < d->nwrds)
473	0	{
474	0	if (cmpLexeme(ptrwrds, newwrds) == 0)
475	0	{
476	0	if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
477	0	{
478	0	ptrwrds->entries->nextentry = newwrds->entries;
479	0	newwrds->entries = ptrwrds->entries;
480	0	}
481	0	else
482	0	pfree(ptrwrds->entries);
483
484	0	if (ptrwrds->lexeme)
485	0	pfree(ptrwrds->lexeme);
486	0	}
487	0	else
488	0	{
489	0	newwrds++;
490	0	newwrds = ptrwrds;
491	0	}
492
493	0	ptrwrds++;
494	0	}
495
496	0	d->nwrds = newwrds - d->wrds + 1;
497	0	d->wrds = (TheLexeme ) repalloc(d->wrds, sizeof(TheLexeme) d->nwrds);
498	0	}
499	0	}
500
501		static void
502		compileTheSubstitute(DictThesaurus *d)
503	0	{
504	0	int i;
505
506	0	for (i = 0; i < d->nsubst; i++)
507	0	{
508	0	TSLexeme *rem = d->subst[i].res,
509	0	*outptr,
510	0	*inptr;
511	0	int n = 2;
512
513	0	outptr = d->subst[i].res = (TSLexeme ) palloc(sizeof(TSLexeme) n);
514	0	outptr->lexeme = NULL;
515	0	inptr = rem;
516
517	0	while (inptr && inptr->lexeme)
518	0	{
519	0	TSLexeme *lexized,
520	0	tmplex[2];
521
522	0	if (inptr->flags & DT_USEASIS)
523	0	{ /* do not lexize */
524	0	tmplex[0] = *inptr;
525	0	tmplex[0].flags = 0;
526	0	tmplex[1].lexeme = NULL;
527	0	lexized = tmplex;
528	0	}
529	0	else
530	0	{
531	0	lexized = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
532	0	PointerGetDatum(d->subdict->dictData),
533	0	PointerGetDatum(inptr->lexeme),
534	0	Int32GetDatum(strlen(inptr->lexeme)),
535	0	PointerGetDatum(NULL)));
536	0	}
537
538	0	if (lexized && lexized->lexeme)
539	0	{
540	0	int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
541
542	0	while (lexized->lexeme)
543	0	{
544	0	if (outptr - d->subst[i].res + 1 >= n)
545	0	{
546	0	int diff = outptr - d->subst[i].res;
547
548	0	n *= 2;
549	0	d->subst[i].res = (TSLexeme ) repalloc(d->subst[i].res, sizeof(TSLexeme) n);
550	0	outptr = d->subst[i].res + diff;
551	0	}
552
553	0	outptr = lexized;
554	0	outptr->lexeme = pstrdup(lexized->lexeme);
555
556	0	outptr++;
557	0	lexized++;
558	0	}
559
560	0	if (toset > 0)
561	0	d->subst[i].res[toset].flags \|= TSL_ADDPOS;
562	0	}
563	0	else if (lexized)
564	0	{
565	0	ereport(ERROR,
566	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
567	0	errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
568	0	inptr->lexeme, i + 1)));
569	0	}
570	0	else
571	0	{
572	0	ereport(ERROR,
573	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
574	0	errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
575	0	inptr->lexeme, i + 1)));
576	0	}
577
578	0	if (inptr->lexeme)
579	0	pfree(inptr->lexeme);
580	0	inptr++;
581	0	}
582
583	0	if (outptr == d->subst[i].res)
584	0	ereport(ERROR,
585	0	(errcode(ERRCODE_CONFIG_FILE_ERROR),
586	0	errmsg("thesaurus substitute phrase is empty (rule %d)",
587	0	i + 1)));
588
589	0	d->subst[i].reslen = outptr - d->subst[i].res;
590
591	0	pfree(rem);
592	0	}
593	0	}
594
595		Datum
596		thesaurus_init(PG_FUNCTION_ARGS)
597	0	{
598	0	List dictoptions = (List ) PG_GETARG_POINTER(0);
599	0	DictThesaurus *d;
600	0	char *subdictname = NULL;
601	0	bool fileloaded = false;
602	0	List *namelist;
603	0	ListCell *l;
604
605	0	d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
606
607	0	foreach(l, dictoptions)
608	0	{
609	0	DefElem defel = (DefElem ) lfirst(l);
610
611	0	if (strcmp(defel->defname, "dictfile") == 0)
612	0	{
613	0	if (fileloaded)
614	0	ereport(ERROR,
615	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
616	0	errmsg("multiple DictFile parameters")));
617	0	thesaurusRead(defGetString(defel), d);
618	0	fileloaded = true;
619	0	}
620	0	else if (strcmp(defel->defname, "dictionary") == 0)
621	0	{
622	0	if (subdictname)
623	0	ereport(ERROR,
624	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
625	0	errmsg("multiple Dictionary parameters")));
626	0	subdictname = pstrdup(defGetString(defel));
627	0	}
628	0	else
629	0	{
630	0	ereport(ERROR,
631	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
632	0	errmsg("unrecognized Thesaurus parameter: \"%s\"",
633	0	defel->defname)));
634	0	}
635	0	}
636
637	0	if (!fileloaded)
638	0	ereport(ERROR,
639	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
640	0	errmsg("missing DictFile parameter")));
641	0	if (!subdictname)
642	0	ereport(ERROR,
643	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
644	0	errmsg("missing Dictionary parameter")));
645
646	0	namelist = stringToQualifiedNameList(subdictname, NULL);
647	0	d->subdictOid = get_ts_dict_oid(namelist, false);
648	0	d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
649
650	0	compileTheLexeme(d);
651	0	compileTheSubstitute(d);
652
653	0	PG_RETURN_POINTER(d);
654	0	}
655
656		static LexemeInfo *
657		findTheLexeme(DictThesaurus d, char lexeme)
658	0	{
659	0	TheLexeme key,
660	0	*res;
661
662	0	if (d->nwrds == 0)
663	0	return NULL;
664
665	0	key.lexeme = lexeme;
666	0	key.entries = NULL;
667
668	0	res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
669
670	0	if (res == NULL)
671	0	return NULL;
672	0	return res->entries;
673	0	}
674
675		static bool
676		matchIdSubst(LexemeInfo *stored, uint32 idsubst)
677	0	{
678	0	bool res = true;
679
680	0	if (stored)
681	0	{
682	0	res = false;
683
684	0	for (; stored; stored = stored->nextvariant)
685	0	if (stored->idsubst == idsubst)
686	0	{
687	0	res = true;
688	0	break;
689	0	}
690	0	}
691
692	0	return res;
693	0	}
694
695		static LexemeInfo *
696		findVariant(LexemeInfo in, LexemeInfo stored, uint16 curpos, LexemeInfo **newin, int newn)
697	0	{
698	0	for (;;)
699	0	{
700	0	int i;
701	0	LexemeInfo *ptr = newin[0];
702
703	0	for (i = 0; i < newn; i++)
704	0	{
705	0	while (newin[i] && newin[i]->idsubst < ptr->idsubst)
706	0	newin[i] = newin[i]->nextentry;
707
708	0	if (newin[i] == NULL)
709	0	return in;
710
711	0	if (newin[i]->idsubst > ptr->idsubst)
712	0	{
713	0	ptr = newin[i];
714	0	i = -1;
715	0	continue;
716	0	}
717
718	0	while (newin[i]->idsubst == ptr->idsubst)
719	0	{
720	0	if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
721	0	{
722	0	ptr = newin[i];
723	0	break;
724	0	}
725
726	0	newin[i] = newin[i]->nextentry;
727	0	if (newin[i] == NULL)
728	0	return in;
729	0	}
730
731	0	if (newin[i]->idsubst != ptr->idsubst)
732	0	{
733	0	ptr = newin[i];
734	0	i = -1;
735	0	continue;
736	0	}
737	0	}
738
739	0	if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL \|\| !matchIdSubst(in, ptr->idsubst)))
740	0	{ /* found */
741
742	0	ptr->nextvariant = in;
743	0	in = ptr;
744	0	}
745
746		/* step forward */
747	0	for (i = 0; i < newn; i++)
748	0	newin[i] = newin[i]->nextentry;
749	0	}
750	0	}
751
752		static TSLexeme *
753		copyTSLexeme(TheSubstitute *ts)
754	0	{
755	0	TSLexeme *res;
756	0	uint16 i;
757
758	0	res = (TSLexeme ) palloc(sizeof(TSLexeme) (ts->reslen + 1));
759	0	for (i = 0; i < ts->reslen; i++)
760	0	{
761	0	res[i] = ts->res[i];
762	0	res[i].lexeme = pstrdup(ts->res[i].lexeme);
763	0	}
764
765	0	res[ts->reslen].lexeme = NULL;
766
767	0	return res;
768	0	}
769
770		static TSLexeme *
771		checkMatch(DictThesaurus d, LexemeInfo info, uint16 curpos, bool *moreres)
772	0	{
773	0	*moreres = false;
774	0	while (info)
775	0	{
776	0	Assert(info->idsubst < d->nsubst);
777	0	if (info->nextvariant)
778	0	*moreres = true;
779	0	if (d->subst[info->idsubst].lastlexeme == curpos)
780	0	return copyTSLexeme(d->subst + info->idsubst);
781	0	info = info->nextvariant;
782	0	}
783
784	0	return NULL;
785	0	}
786
787		Datum
788		thesaurus_lexize(PG_FUNCTION_ARGS)
789	0	{
790	0	DictThesaurus d = (DictThesaurus ) PG_GETARG_POINTER(0);
791	0	DictSubState dstate = (DictSubState ) PG_GETARG_POINTER(3);
792	0	TSLexeme *res = NULL;
793	0	LexemeInfo *stored,
794	0	*info = NULL;
795	0	uint16 curpos = 0;
796	0	bool moreres = false;
797
798	0	if (PG_NARGS() != 4 \|\| dstate == NULL)
799	0	elog(ERROR, "forbidden call of thesaurus or nested call");
800
801	0	if (dstate->isend)
802	0	PG_RETURN_POINTER(NULL);
803	0	stored = (LexemeInfo *) dstate->private_state;
804
805	0	if (stored)
806	0	curpos = stored->posinsubst + 1;
807
808	0	if (!d->subdict->isvalid)
809	0	d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
810
811	0	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
812	0	PointerGetDatum(d->subdict->dictData),
813	0	PG_GETARG_DATUM(1),
814	0	PG_GETARG_DATUM(2),
815	0	PointerGetDatum(NULL)));
816
817	0	if (res && res->lexeme)
818	0	{
819	0	TSLexeme *ptr = res,
820	0	*basevar;
821
822	0	while (ptr->lexeme)
823	0	{
824	0	uint16 nv = ptr->nvariant;
825	0	uint16 i,
826	0	nlex = 0;
827	0	LexemeInfo **infos;
828
829	0	basevar = ptr;
830	0	while (ptr->lexeme && nv == ptr->nvariant)
831	0	{
832	0	nlex++;
833	0	ptr++;
834	0	}
835
836	0	infos = (LexemeInfo *) palloc(sizeof(LexemeInfo ) * nlex);
837	0	for (i = 0; i < nlex; i++)
838	0	if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
839	0	break;
840
841	0	if (i < nlex)
842	0	{
843		/* no chance to find */
844	0	pfree(infos);
845	0	continue;
846	0	}
847
848	0	info = findVariant(info, stored, curpos, infos, nlex);
849	0	}
850	0	}
851	0	else if (res)
852	0	{ /* stop-word */
853	0	LexemeInfo *infos = findTheLexeme(d, NULL);
854
855	0	info = findVariant(NULL, stored, curpos, &infos, 1);
856	0	}
857	0	else
858	0	{
859	0	info = NULL; /* word isn't recognized */
860	0	}
861
862	0	dstate->private_state = info;
863
864	0	if (!info)
865	0	{
866	0	dstate->getnext = false;
867	0	PG_RETURN_POINTER(NULL);
868	0	}
869
870	0	if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
871	0	{
872	0	dstate->getnext = moreres;
873	0	PG_RETURN_POINTER(res);
874	0	}
875
876	0	dstate->getnext = true;
877
878	0	PG_RETURN_POINTER(NULL);
879	0	}