/src/postgres/src/backend/tsearch/ts_parse.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *    main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/backend/tsearch/ts_parse.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#include "varatt.h"

#define IGNORE_LONGLEXEME 1

/*
 * Lexize subsystem
 */

/*
 * One token handed over by the parser, kept as a node of a singly-linked
 * list (see ListParsedLex).
 */
typedef struct ParsedLex
{
  int     type;     /* token type as returned by the parser's token function */
  char     *lemm;   /* token text; the pointer is stored as given, not copied */
  int     lenlemm;  /* length of lemm, in bytes */
  struct ParsedLex *next; /* following node, or NULL at the end of the list */
} ParsedLex;

/*
 * Singly-linked list of ParsedLex nodes with direct access to both ends.
 * An empty list has head == tail == NULL.
 */
typedef struct ListParsedLex
{
  ParsedLex  *head;   /* first node, removed by LPLRemoveHead() */
  ParsedLex  *tail;   /* last node, appended to by LPLAddTail() */
} ListParsedLex;

/*
 * Working state of the lexize subsystem; it is carried across successive
 * LexizeExec() calls.
 */
typedef struct
{
  /* configuration whose per-token-type dictionary map is consulted */
  TSConfigCacheEntry *cfg;
  /* InvalidOid in single-word mode, else the dictionary that wants more words */
  Oid     curDictId;
  /* index in the head token's dictionary list where single-word mode starts */
  int     posDict;
  /* state block passed to every dictionary lexize call */
  DictSubState dictState;
  /* next lexeme to offer to curDictId in multiword mode */
  ParsedLex  *curSub;
  ListParsedLex towork;   /* current list to work */
  ListParsedLex waste;    /* list of lexemes that are already lexized */

  /*
   * Fields to store the last variant to lexize (basically for a thesaurus
   * or a similar dictionary, which wants several lexemes): tmpRes is the
   * most recent result returned while the dictionary was still asking for
   * more input, and lastRes is the lexeme at which that result was obtained.
   */

  ParsedLex  *lastRes;
  TSLexeme   *tmpRes;
} LexizeData;

/*
 * Put *ld into its starting state for configuration cfg: single-word mode,
 * both lexeme lists empty and no pending multiword result.
 *
 * ld->dictState is not touched here; LexizeExec() sets it up before each
 * dictionary call.
 */
static void
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
{
  /* no multiword result is pending */
  ld->tmpRes = NULL;
  ld->lastRes = NULL;

  /* both lists start out empty, and there is nothing to sub-scan */
  ld->waste.head = NULL;
  ld->waste.tail = NULL;
  ld->towork.head = NULL;
  ld->towork.tail = NULL;
  ld->curSub = NULL;

  /* single-word mode, beginning with the first dictionary of the list */
  ld->posDict = 0;
  ld->curDictId = InvalidOid;

  ld->cfg = cfg;
}

/*
 * Append newpl to the end of list.  newpl becomes the new tail and its
 * next pointer is cleared.
 */
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
{
  ParsedLex  *oldtail = list->tail;

  if (oldtail == NULL)
    list->head = newpl;     /* list was empty */
  else
    oldtail->next = newpl;

  list->tail = newpl;
  newpl->next = NULL;
}

/*
 * Unlink and return the first node of list, or return NULL if the list is
 * empty.  The tail pointer is cleared whenever the list ends up empty.
 *
 * The next pointer of the returned node is left as it was.
 */
static ParsedLex *
LPLRemoveHead(ListParsedLex *list)
{
  ParsedLex  *first = list->head;

  if (first == NULL)
  {
    list->tail = NULL;
    return NULL;
  }

  list->head = first->next;
  if (list->head == NULL)
    list->tail = NULL;

  return first;
}

/*
 * Queue one parser token for lexizing.
 *
 * A new ParsedLex node is allocated and appended to ld->towork; the lemm
 * pointer is stored as-is (the text is not copied).  ld->curSub is set to
 * the node just added.
 */
static void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
{
  ParsedLex  *node = (ParsedLex *) palloc(sizeof(ParsedLex));

  node->lenlemm = lenlemm;
  node->lemm = lemm;
  node->type = type;

  LPLAddTail(&ld->towork, node);

  /* LPLAddTail() made the node the tail of towork */
  ld->curSub = node;
}

/*
 * Move the first lexeme of ld->towork to the end of ld->waste, and restart
 * the dictionary position for whatever lexeme becomes the new head.
 */
static void
RemoveHead(LexizeData *ld)
{
  ParsedLex  *done = LPLRemoveHead(&ld->towork);

  LPLAddTail(&ld->waste, done);
  ld->posDict = 0;
}

/*
 * Dispose of the waste list.
 *
 * If correspondLexem is not NULL, the head of the waste list is stored in
 * *correspondLexem (the nodes are not freed here); otherwise every node of
 * the waste list is freed.  Either way the waste list is empty afterwards.
 */
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
{
  if (correspondLexem != NULL)
    *correspondLexem = ld->waste.head;
  else
  {
    ParsedLex  *cur = ld->waste.head;

    while (cur != NULL)
    {
      ParsedLex  *following = cur->next;

      pfree(cur);
      cur = following;
    }
  }

  ld->waste.tail = NULL;
  ld->waste.head = NULL;
}

/*
 * Move lexemes from the front of ld->towork to ld->waste, up to and
 * including stop.  When stop is reached, ld->curSub is set to the lexeme
 * that followed it.  If stop is never met, the whole towork list is moved.
 */
static void
moveToWaste(LexizeData *ld, ParsedLex *stop)
{
  for (;;)
  {
    ParsedLex  *head = ld->towork.head;
    bool    atStop;

    if (!head)
      break;

    atStop = (head == stop);

    /* must read stop->next before RemoveHead() relinks the node */
    if (atStop)
      ld->curSub = stop->next;

    RemoveHead(ld);

    if (atStop)
      break;
  }
}

/*
 * Remember res as the pending multiword result, obtained at lexeme lex.
 *
 * A previously stored result, if any, is released first: each lexeme
 * string of the array (up to the entry with a NULL lexeme) and then the
 * array itself.
 */
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
{
  TSLexeme   *old = ld->tmpRes;

  if (old)
  {
    TSLexeme   *cur = old;

    while (cur->lexeme)
    {
      pfree(cur->lexeme);
      cur++;
    }
    pfree(old);
  }

  ld->lastRes = lex;
  ld->tmpRes = res;
}

/*
 * Try to produce the next dictionary result from the lexemes queued in ld.
 *
 * Returns the TSLexeme array obtained from a dictionary's lexize function,
 * or NULL when nothing (more) can be produced from the lexemes queued so
 * far.
 *
 * The function works in one of two modes, selected by ld->curDictId:
 *
 * - InvalidOid (single-word mode): the head of ld->towork is offered to the
 *   dictionaries mapped to its token type, starting at index ld->posDict.
 *   Lexemes that no dictionary accepts, or whose type has no dictionaries,
 *   are moved to ld->waste.
 *
 * - otherwise (multiword mode): dictionary ld->curDictId has set getnext,
 *   so the lexemes from ld->curSub onwards are fed to it one by one.
 *
 * Before a result (or the final NULL) is returned, setCorrLex() is called:
 * if correspondLexem is not NULL it receives the list of lexemes consumed
 * so far (the waste list), otherwise those lexemes are freed.
 *
 * NULL can be returned while still in multiword mode, when ld->curSub has
 * run out; ld->curDictId stays set so a later call continues from there.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
  int     i;
  ListDictionary *map;
  TSDictionaryCacheEntry *dict;
  TSLexeme   *res;

  if (ld->curDictId == InvalidOid)
  {
    /*
     * usual mode: dictionary wants only one word, but we should keep in
     * mind that we should go through all stack
     */

    while (ld->towork.head)
    {
      ParsedLex  *curVal = ld->towork.head;
      char     *curValLemm = curVal->lemm;
      int     curValLenLemm = curVal->lenlemm;

      /* map is only dereferenced after the range check on type below */
      map = ld->cfg->map + curVal->type;

      if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
      {
        /* skip this type of lexeme */
        RemoveHead(ld);
        continue;
      }

      /* try each mapped dictionary in order, resuming at posDict */
      for (i = ld->posDict; i < map->len; i++)
      {
        dict = lookup_ts_dictionary_cache(map->dictIds[i]);

        /* fresh sub-state for the first word offered to this dictionary */
        ld->dictState.isend = ld->dictState.getnext = false;
        ld->dictState.private_state = NULL;
        res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
                                 PointerGetDatum(dict->dictData),
                                 PointerGetDatum(curValLemm),
                                 Int32GetDatum(curValLenLemm),
                                 PointerGetDatum(&ld->dictState)));

        if (ld->dictState.getnext)
        {
          /*
           * dictionary wants next word, so setup and store current
           * position and go to multiword mode
           */

          ld->curDictId = map->dictIds[i];
          /* if multiword mode gives up, resume after this dictionary */
          ld->posDict = i + 1;
          ld->curSub = curVal->next;
          /* keep a result returned alongside getnext as the fallback */
          if (res)
            setNewTmpRes(ld, curVal, res);
          return LexizeExec(ld, correspondLexem);
        }

        if (!res)   /* dictionary doesn't know this lexeme */
          continue;

        if (res->flags & TSL_FILTER)
        {
          /*
           * The first returned lexeme replaces the token text offered
           * to the remaining dictionaries in the list.
           */
          curValLemm = res->lexeme;
          curValLenLemm = strlen(res->lexeme);
          continue;
        }

        /* recognized: the lexeme is consumed and the result returned */
        RemoveHead(ld);
        setCorrLex(ld, correspondLexem);
        return res;
      }

      /* no dictionary in the list produced a result for this lexeme */
      RemoveHead(ld);
    }
  }
  else
  {             /* curDictId is valid */
    dict = lookup_ts_dictionary_cache(ld->curDictId);

    /*
     * Dictionary ld->curDictId asks us about following words
     */

    while (ld->curSub)
    {
      ParsedLex  *curVal = ld->curSub;

      /* map is only dereferenced after the range check on type below */
      map = ld->cfg->map + curVal->type;

      if (curVal->type != 0)
      {
        bool    dictExists = false;

        if (curVal->type >= ld->cfg->lenmap || map->len == 0)
        {
          /* skip this type of lexeme */
          ld->curSub = curVal->next;
          continue;
        }

        /*
         * We must be sure that the current type of lexeme is
         * recognized by our dictionary: we just check whether the
         * dictionary is in the list mapped to this token type.
         */
        for (i = 0; i < map->len && !dictExists; i++)
          if (ld->curDictId == map->dictIds[i])
            dictExists = true;

        if (!dictExists)
        {
          /*
           * Dictionary can't work with current type of lexeme,
           * return to basic mode and redo all stored lexemes
           */
          ld->curDictId = InvalidOid;
          return LexizeExec(ld, correspondLexem);
        }
      }

      /*
       * A lexeme of type 0 is passed to the dictionary with isend set.
       * private_state is deliberately left alone here, so it carries
       * over from the previous call to this dictionary.
       */
      ld->dictState.isend = (curVal->type == 0);
      ld->dictState.getnext = false;

      res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
                               PointerGetDatum(dict->dictData),
                               PointerGetDatum(curVal->lemm),
                               Int32GetDatum(curVal->lenlemm),
                               PointerGetDatum(&ld->dictState)));

      if (ld->dictState.getnext)
      {
        /* Dictionary wants one more */
        ld->curSub = curVal->next;
        if (res)
          setNewTmpRes(ld, curVal, res);
        continue;
      }

      if (res || ld->tmpRes)
      {
        /*
         * Dictionary normalizes lexemes, so we remove from stack all
         * used lexemes, return to basic mode and redo end of stack
         * (if it exists)
         */
        if (res)
        {
          /* fresh result: everything up to the current lexeme is used */
          moveToWaste(ld, ld->curSub);
        }
        else
        {
          /* fall back to the stored result and the lexeme it ended at */
          res = ld->tmpRes;
          moveToWaste(ld, ld->lastRes);
        }

        /* reset to initial state */
        ld->curDictId = InvalidOid;
        ld->posDict = 0;
        ld->lastRes = NULL;
        ld->tmpRes = NULL;
        setCorrLex(ld, correspondLexem);
        return res;
      }

      /*
       * The dictionary doesn't want the next lexeme and didn't recognize
       * anything; redo from ld->towork.head in basic mode (ld->posDict
       * still points past this dictionary).
       */
      ld->curDictId = InvalidOid;
      return LexizeExec(ld, correspondLexem);
    }
  }

  /* nothing more can be produced from the lexemes queued so far */
  setCorrLex(ld, correspondLexem);
  return NULL;
}

/*
 * Parse string and lexize words.
 *
 * The buffer buf/buflen is tokenized with the parser of configuration
 * cfgId, each token is run through LexizeExec(), and every lexeme returned
 * is appended to prs->words (the array is doubled whenever it is full).
 *
 * prs->pos is advanced once per dictionary result and once more for each
 * lexeme flagged TSL_ADDPOS.  The stored word pointers are the lexeme
 * strings returned by the dictionary; only the result array is freed here.
 *
 * Tokens whose length is MAXSTRLEN or more are reported and skipped (or
 * raise an error if IGNORE_LONGLEXEME is not defined).
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
  TSConfigCacheEntry *cfg = lookup_ts_config_cache(cfgId);
  TSParserCacheEntry *prsobj = lookup_ts_parser_cache(cfg->prsId);
  void     *prsdata;
  LexizeData  ldata;
  TSLexeme   *norms;
  char     *token = NULL;
  int     tokenlen = 0; /* silence compiler warning */
  int     toktype;

  prsdata = DatumGetPointer(FunctionCall2(&prsobj->prsstart,
                      PointerGetDatum(buf),
                      Int32GetDatum(buflen)));

  LexizeInit(&ldata, cfg);

  do
  {
    /* fetch the next token; a type <= 0 ends the loop after this pass */
    toktype = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                        PointerGetDatum(prsdata),
                        PointerGetDatum(&token),
                        PointerGetDatum(&tokenlen)));

    if (toktype > 0 && tokenlen >= MAXSTRLEN)
    {
#ifdef IGNORE_LONGLEXEME
      ereport(NOTICE,
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
           errmsg("word is too long to be indexed"),
           errdetail("Words longer than %d characters are ignored.",
                 MAXSTRLEN)));
      continue;
#else
      ereport(ERROR,
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
           errmsg("word is too long to be indexed"),
           errdetail("Words longer than %d characters are ignored.",
                 MAXSTRLEN)));
#endif
    }

    LexizeAddLemm(&ldata, toktype, token, tokenlen);

    /* drain every result that is available with the tokens seen so far */
    for (;;)
    {
      TSLexeme   *lex;

      norms = LexizeExec(&ldata, NULL);
      if (norms == NULL)
        break;

      prs->pos++;     /* set pos */

      for (lex = norms; lex->lexeme; lex++)
      {
        ParsedWord *slot;

        if (prs->curwords == prs->lenwords)
        {
          prs->lenwords *= 2;
          prs->words = (ParsedWord *) repalloc(prs->words, prs->lenwords * sizeof(ParsedWord));
        }

        if (lex->flags & TSL_ADDPOS)
          prs->pos++;

        slot = &prs->words[prs->curwords];
        slot->len = strlen(lex->lexeme);
        slot->word = lex->lexeme;
        slot->nvariant = lex->nvariant;
        slot->flags = lex->flags & TSL_PREFIX;
        slot->alen = 0;
        slot->pos.pos = LIMITPOS(prs->pos);

        prs->curwords++;
      }

      /* the lexeme strings now belong to prs->words; free the array only */
      pfree(norms);
    }
  } while (toktype > 0);

  FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}

/*
 * Headline framework
 */

/*
 * Append a word to prs->words[].
 *
 * The array is doubled if it is full.  The new entry is zeroed, then given
 * the token type (truncated to uint8), the length buflen, and a freshly
 * allocated copy of the buflen bytes at buf.
 */
static void
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
{
  HeadlineWordEntry *entry;

  if (prs->curwords >= prs->lenwords)
  {
    prs->lenwords *= 2;
    prs->words = (HeadlineWordEntry *) repalloc(prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
  }

  entry = &(prs->words[prs->curwords]);
  memset(entry, 0, sizeof(HeadlineWordEntry));
  entry->type = (uint8) type;
  entry->len = buflen;
  entry->word = palloc(buflen);
  memcpy(entry->word, buf, buflen);

  /* count the entry only once it is completely filled in */
  prs->curwords++;
}

/*
 * Attach position and matching-query-item data to the word added last.
 * buf/buflen is a processed lexeme here, not the raw token text.
 *
 * Every QI_VAL item of the query is compared against the lexeme.  The
 * first match is recorded in the word itself; for each further match the
 * word entry is copied to the end of prs->words[], pointed at that item
 * and marked with repeated = 1.
 *
 * The array is enlarged up front so that query->size extra entries fit.
 */
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
  QueryItem  *item = GETQUERY(query);
  HeadlineWordEntry *word;
  int     remaining;

  /* make room for the worst case: every query item matches */
  while (prs->curwords + query->size >= prs->lenwords)
  {
    prs->lenwords *= 2;
    prs->words = (HeadlineWordEntry *) repalloc(prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
  }

  word = &(prs->words[prs->curwords - 1]);
  word->pos = LIMITPOS(pos);

  for (remaining = query->size; remaining > 0; remaining--, item++)
  {
    HeadlineWordEntry *copy;

    if (item->type != QI_VAL)
      continue;

    if (tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
              buf, buflen, item->qoperand.prefix) != 0)
      continue;

    if (!word->item)
    {
      /* first matching item goes into the word itself */
      word->item = &item->qoperand;
      continue;
    }

    /* further matches get a duplicate entry of their own */
    copy = &(prs->words[prs->curwords]);
    memcpy(copy, word, sizeof(HeadlineWordEntry));
    copy->item = &item->qoperand;
    copy->repeated = 1;
    prs->curwords++;
  }
}

/*
 * Add the tokens in list lexs, together with their dictionary result
 * norms (which may be NULL), to the headline word array.
 *
 * Each token with type > 0 is appended through hladdword(); then, for
 * every lexeme in norms, hlfinditem() is called to record position and
 * query matches.  The position starts from prs->vectorpos for each token
 * and is bumped for lexemes flagged TSL_ADDPOS.
 *
 * Everything passed in is released: each ParsedLex node, each lexeme
 * string of norms and the norms array.  prs->vectorpos is advanced once
 * per TSL_ADDPOS lexeme while norms is being freed.
 */
static void
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{
  TSLexeme   *cur;

  while (lexs)
  {
    ParsedLex  *following = lexs->next;

    if (lexs->type > 0)
      hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

    if (norms)
    {
      int32   wordpos = prs->vectorpos;

      for (cur = norms; cur->lexeme; cur++)
      {
        if (cur->flags & TSL_ADDPOS)
          wordpos++;
        hlfinditem(prs, query, wordpos, cur->lexeme, strlen(cur->lexeme));
      }
    }

    pfree(lexs);
    lexs = following;
  }

  if (!norms)
    return;

  for (cur = norms; cur->lexeme; cur++)
  {
    if (cur->flags & TSL_ADDPOS)
      prs->vectorpos++;
    pfree(cur->lexeme);
  }
  pfree(norms);
}

/*
 * Parse buf/buflen for headline generation.
 *
 * The text is tokenized with the parser of configuration cfgId and each
 * token is queued for LexizeExec().  After every LexizeExec() call the
 * tokens it consumed, and the result if there is one, are handed to
 * addHLParsedLex(), which fills prs->words.  prs->vectorpos is advanced
 * once for each non-NULL dictionary result.
 *
 * Tokens whose length is MAXSTRLEN or more are reported and skipped (or
 * raise an error if IGNORE_LONGLEXEME is not defined).
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
  TSConfigCacheEntry *cfg = lookup_ts_config_cache(cfgId);
  TSParserCacheEntry *prsobj = lookup_ts_parser_cache(cfg->prsId);
  void     *prsdata;
  LexizeData  ldata;
  ParsedLex  *lexs;
  TSLexeme   *norms;
  char     *token = NULL;
  int     tokenlen = 0; /* silence compiler warning */
  int     toktype;

  prsdata = DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
                      PointerGetDatum(buf),
                      Int32GetDatum(buflen)));

  LexizeInit(&ldata, cfg);

  do
  {
    /* fetch the next token; a type <= 0 ends the loop after this pass */
    toktype = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                        PointerGetDatum(prsdata),
                        PointerGetDatum(&token),
                        PointerGetDatum(&tokenlen)));

    if (toktype > 0 && tokenlen >= MAXSTRLEN)
    {
#ifdef IGNORE_LONGLEXEME
      ereport(NOTICE,
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
           errmsg("word is too long to be indexed"),
           errdetail("Words longer than %d characters are ignored.",
                 MAXSTRLEN)));
      continue;
#else
      ereport(ERROR,
          (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
           errmsg("word is too long to be indexed"),
           errdetail("Words longer than %d characters are ignored.",
                 MAXSTRLEN)));
#endif
    }

    LexizeAddLemm(&ldata, toktype, token, tokenlen);

    /*
     * Consumed tokens are passed on even when no result came back; the
     * loop stops after the first call that returns NULL.
     */
    do
    {
      norms = LexizeExec(&ldata, &lexs);
      if (norms)
        prs->vectorpos++;
      addHLParsedLex(prs, query, lexs, norms);
    } while (norms);
  } while (toktype > 0);

  FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}

/*
 * Generate the headline, as a text object, from HeadlineParsedText.
 *
 * The words of prs are visited in order; entries marked repeated are
 * ignored.  A word with "in" set is part of the output: it is written as a
 * single space if "replace" is set, left out if "skip" is set, and
 * otherwise copied, wrapped in startsel/stopsel when "selected" is set.
 * A run of "in" words forms a fragment, and fragdelim is written before
 * every fragment except the first.
 *
 * Side effect: the text of each non-repeated word that is not "in" is
 * freed.
 */
text *
generateHeadline(HeadlineParsedText *prs)
{
  int     bufsize = 128;
  text     *out = (text *) palloc(bufsize);
  char     *ptr = ((char *) out) + VARHDRSZ;
  int     numfragments = 0;
  int16   infrag = 0;
  int     idx;

  for (idx = 0; idx < prs->curwords; idx++)
  {
    HeadlineWordEntry *wrd = &prs->words[idx];

    /* grow until the most this word could add is certain to fit */
    while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= bufsize)
    {
      int     used = ptr - ((char *) out);

      bufsize *= 2;
      out = (text *) repalloc(out, bufsize);
      ptr = ((char *) out) + used;
    }

    if (wrd->repeated)
      continue;

    if (!wrd->in)
    {
      /* word is outside the headline: any open fragment ends here */
      infrag = 0;
      pfree(wrd->word);
      continue;
    }

    if (!infrag)
    {
      /* start of a new fragment */
      infrag = 1;
      numfragments++;
      /* add a fragment delimiter if this is after the first one */
      if (numfragments > 1)
      {
        memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
        ptr += prs->fragdelimlen;
      }
    }

    if (wrd->replace)
      *ptr++ = ' ';
    else if (!wrd->skip)
    {
      if (wrd->selected)
      {
        memcpy(ptr, prs->startsel, prs->startsellen);
        ptr += prs->startsellen;
      }

      memcpy(ptr, wrd->word, wrd->len);
      ptr += wrd->len;

      if (wrd->selected)
      {
        memcpy(ptr, prs->stopsel, prs->stopsellen);
        ptr += prs->stopsellen;
      }
    }
  }

  SET_VARSIZE(out, ptr - ((char *) out));
  return out;
}

Coverage Report

Created: 2025-08-12 06:43

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		*
3		* ts_parse.c
4		* main parse functions for tsearch
5		*
6		* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7		*
8		*
9		* IDENTIFICATION
10		* src/backend/tsearch/ts_parse.c
11		*
12		*-------------------------------------------------------------------------
13		*/
14
15		#include "postgres.h"
16
17		#include "tsearch/ts_cache.h"
18		#include "tsearch/ts_utils.h"
19		#include "varatt.h"
20
21		#define IGNORE_LONGLEXEME 1
22
23		/*
24		* Lexize subsystem
25		*/
26
27		typedef struct ParsedLex
28		{
29		int type;
30		char *lemm;
31		int lenlemm;
32		struct ParsedLex *next;
33		} ParsedLex;
34
35		typedef struct ListParsedLex
36		{
37		ParsedLex *head;
38		ParsedLex *tail;
39		} ListParsedLex;
40
41		typedef struct
42		{
43		TSConfigCacheEntry *cfg;
44		Oid curDictId;
45		int posDict;
46		DictSubState dictState;
47		ParsedLex *curSub;
48		ListParsedLex towork; /* current list to work */
49		ListParsedLex waste; /* list of lexemes that already lexized */
50
51		/*
52		* fields to store last variant to lexize (basically, thesaurus or similar
53		* to, which wants several lexemes
54		*/
55
56		ParsedLex *lastRes;
57		TSLexeme *tmpRes;
58		} LexizeData;
59
60		static void
61		LexizeInit(LexizeData ld, TSConfigCacheEntry cfg)
62	0	{
63	0	ld->cfg = cfg;
64	0	ld->curDictId = InvalidOid;
65	0	ld->posDict = 0;
66	0	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
67	0	ld->waste.head = ld->waste.tail = NULL;
68	0	ld->lastRes = NULL;
69	0	ld->tmpRes = NULL;
70	0	}
71
72		static void
73		LPLAddTail(ListParsedLex list, ParsedLex newpl)
74	0	{
75	0	if (list->tail)
76	0	{
77	0	list->tail->next = newpl;
78	0	list->tail = newpl;
79	0	}
80	0	else
81	0	list->head = list->tail = newpl;
82	0	newpl->next = NULL;
83	0	}
84
85		static ParsedLex *
86		LPLRemoveHead(ListParsedLex *list)
87	0	{
88	0	ParsedLex *res = list->head;
89
90	0	if (list->head)
91	0	list->head = list->head->next;
92
93	0	if (list->head == NULL)
94	0	list->tail = NULL;
95
96	0	return res;
97	0	}
98
99		static void
100		LexizeAddLemm(LexizeData ld, int type, char lemm, int lenlemm)
101	0	{
102	0	ParsedLex newpl = (ParsedLex ) palloc(sizeof(ParsedLex));
103
104	0	newpl->type = type;
105	0	newpl->lemm = lemm;
106	0	newpl->lenlemm = lenlemm;
107	0	LPLAddTail(&ld->towork, newpl);
108	0	ld->curSub = ld->towork.tail;
109	0	}
110
111		static void
112		RemoveHead(LexizeData *ld)
113	0	{
114	0	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
115
116	0	ld->posDict = 0;
117	0	}
118
119		static void
120		setCorrLex(LexizeData ld, ParsedLex *correspondLexem)
121	0	{
122	0	if (correspondLexem)
123	0	{
124	0	*correspondLexem = ld->waste.head;
125	0	}
126	0	else
127	0	{
128	0	ParsedLex *tmp,
129	0	*ptr = ld->waste.head;
130
131	0	while (ptr)
132	0	{
133	0	tmp = ptr->next;
134	0	pfree(ptr);
135	0	ptr = tmp;
136	0	}
137	0	}
138	0	ld->waste.head = ld->waste.tail = NULL;
139	0	}
140
141		static void
142		moveToWaste(LexizeData ld, ParsedLex stop)
143	0	{
144	0	bool go = true;
145
146	0	while (ld->towork.head && go)
147	0	{
148	0	if (ld->towork.head == stop)
149	0	{
150	0	ld->curSub = stop->next;
151	0	go = false;
152	0	}
153	0	RemoveHead(ld);
154	0	}
155	0	}
156
157		static void
158		setNewTmpRes(LexizeData ld, ParsedLex lex, TSLexeme *res)
159	0	{
160	0	if (ld->tmpRes)
161	0	{
162	0	TSLexeme *ptr;
163
164	0	for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
165	0	pfree(ptr->lexeme);
166	0	pfree(ld->tmpRes);
167	0	}
168	0	ld->tmpRes = res;
169	0	ld->lastRes = lex;
170	0	}
171
172		static TSLexeme *
173		LexizeExec(LexizeData ld, ParsedLex *correspondLexem)
174	0	{
175	0	int i;
176	0	ListDictionary *map;
177	0	TSDictionaryCacheEntry *dict;
178	0	TSLexeme *res;
179
180	0	if (ld->curDictId == InvalidOid)
181	0	{
182		/*
183		* usual mode: dictionary wants only one word, but we should keep in
184		* mind that we should go through all stack
185		*/
186
187	0	while (ld->towork.head)
188	0	{
189	0	ParsedLex *curVal = ld->towork.head;
190	0	char *curValLemm = curVal->lemm;
191	0	int curValLenLemm = curVal->lenlemm;
192
193	0	map = ld->cfg->map + curVal->type;
194
195	0	if (curVal->type == 0 \|\| curVal->type >= ld->cfg->lenmap \|\| map->len == 0)
196	0	{
197		/* skip this type of lexeme */
198	0	RemoveHead(ld);
199	0	continue;
200	0	}
201
202	0	for (i = ld->posDict; i < map->len; i++)
203	0	{
204	0	dict = lookup_ts_dictionary_cache(map->dictIds[i]);
205
206	0	ld->dictState.isend = ld->dictState.getnext = false;
207	0	ld->dictState.private_state = NULL;
208	0	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
209	0	PointerGetDatum(dict->dictData),
210	0	PointerGetDatum(curValLemm),
211	0	Int32GetDatum(curValLenLemm),
212	0	PointerGetDatum(&ld->dictState)));
213
214	0	if (ld->dictState.getnext)
215	0	{
216		/*
217		* dictionary wants next word, so setup and store current
218		* position and go to multiword mode
219		*/
220
221	0	ld->curDictId = map->dictIds[i];
222	0	ld->posDict = i + 1;
223	0	ld->curSub = curVal->next;
224	0	if (res)
225	0	setNewTmpRes(ld, curVal, res);
226	0	return LexizeExec(ld, correspondLexem);
227	0	}
228
229	0	if (!res) /* dictionary doesn't know this lexeme */
230	0	continue;
231
232	0	if (res->flags & TSL_FILTER)
233	0	{
234	0	curValLemm = res->lexeme;
235	0	curValLenLemm = strlen(res->lexeme);
236	0	continue;
237	0	}
238
239	0	RemoveHead(ld);
240	0	setCorrLex(ld, correspondLexem);
241	0	return res;
242	0	}
243
244	0	RemoveHead(ld);
245	0	}
246	0	}
247	0	else
248	0	{ /* curDictId is valid */
249	0	dict = lookup_ts_dictionary_cache(ld->curDictId);
250
251		/*
252		* Dictionary ld->curDictId asks us about following words
253		*/
254
255	0	while (ld->curSub)
256	0	{
257	0	ParsedLex *curVal = ld->curSub;
258
259	0	map = ld->cfg->map + curVal->type;
260
261	0	if (curVal->type != 0)
262	0	{
263	0	bool dictExists = false;
264
265	0	if (curVal->type >= ld->cfg->lenmap \|\| map->len == 0)
266	0	{
267		/* skip this type of lexeme */
268	0	ld->curSub = curVal->next;
269	0	continue;
270	0	}
271
272		/*
273		* We should be sure that current type of lexeme is recognized
274		* by our dictionary: we just check is it exist in list of
275		* dictionaries ?
276		*/
277	0	for (i = 0; i < map->len && !dictExists; i++)
278	0	if (ld->curDictId == map->dictIds[i])
279	0	dictExists = true;
280
281	0	if (!dictExists)
282	0	{
283		/*
284		* Dictionary can't work with current type of lexeme,
285		* return to basic mode and redo all stored lexemes
286		*/
287	0	ld->curDictId = InvalidOid;
288	0	return LexizeExec(ld, correspondLexem);
289	0	}
290	0	}
291
292	0	ld->dictState.isend = (curVal->type == 0);
293	0	ld->dictState.getnext = false;
294
295	0	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
296	0	PointerGetDatum(dict->dictData),
297	0	PointerGetDatum(curVal->lemm),
298	0	Int32GetDatum(curVal->lenlemm),
299	0	PointerGetDatum(&ld->dictState)));
300
301	0	if (ld->dictState.getnext)
302	0	{
303		/* Dictionary wants one more */
304	0	ld->curSub = curVal->next;
305	0	if (res)
306	0	setNewTmpRes(ld, curVal, res);
307	0	continue;
308	0	}
309
310	0	if (res \|\| ld->tmpRes)
311	0	{
312		/*
313		* Dictionary normalizes lexemes, so we remove from stack all
314		* used lexemes, return to basic mode and redo end of stack
315		* (if it exists)
316		*/
317	0	if (res)
318	0	{
319	0	moveToWaste(ld, ld->curSub);
320	0	}
321	0	else
322	0	{
323	0	res = ld->tmpRes;
324	0	moveToWaste(ld, ld->lastRes);
325	0	}
326
327		/* reset to initial state */
328	0	ld->curDictId = InvalidOid;
329	0	ld->posDict = 0;
330	0	ld->lastRes = NULL;
331	0	ld->tmpRes = NULL;
332	0	setCorrLex(ld, correspondLexem);
333	0	return res;
334	0	}
335
336		/*
337		* Dict don't want next lexem and didn't recognize anything, redo
338		* from ld->towork.head
339		*/
340	0	ld->curDictId = InvalidOid;
341	0	return LexizeExec(ld, correspondLexem);
342	0	}
343	0	}
344
345	0	setCorrLex(ld, correspondLexem);
346	0	return NULL;
347	0	}
348
349		/*
350		* Parse string and lexize words.
351		*
352		* prs will be filled in.
353		*/
354		void
355		parsetext(Oid cfgId, ParsedText prs, char buf, int buflen)
356	0	{
357	0	int type,
358	0	lenlemm = 0; /* silence compiler warning */
359	0	char *lemm = NULL;
360	0	LexizeData ldata;
361	0	TSLexeme *norms;
362	0	TSConfigCacheEntry *cfg;
363	0	TSParserCacheEntry *prsobj;
364	0	void *prsdata;
365
366	0	cfg = lookup_ts_config_cache(cfgId);
367	0	prsobj = lookup_ts_parser_cache(cfg->prsId);
368
369	0	prsdata = DatumGetPointer(FunctionCall2(&prsobj->prsstart,
370	0	PointerGetDatum(buf),
371	0	Int32GetDatum(buflen)));
372
373	0	LexizeInit(&ldata, cfg);
374
375	0	do
376	0	{
377	0	type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
378	0	PointerGetDatum(prsdata),
379	0	PointerGetDatum(&lemm),
380	0	PointerGetDatum(&lenlemm)));
381
382	0	if (type > 0 && lenlemm >= MAXSTRLEN)
383	0	{
384	0	#ifdef IGNORE_LONGLEXEME
385	0	ereport(NOTICE,
386	0	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
387	0	errmsg("word is too long to be indexed"),
388	0	errdetail("Words longer than %d characters are ignored.",
389	0	MAXSTRLEN)));
390	0	continue;
391		#else
392		ereport(ERROR,
393		(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
394		errmsg("word is too long to be indexed"),
395		errdetail("Words longer than %d characters are ignored.",
396		MAXSTRLEN)));
397		#endif
398	0	}
399
400	0	LexizeAddLemm(&ldata, type, lemm, lenlemm);
401
402	0	while ((norms = LexizeExec(&ldata, NULL)) != NULL)
403	0	{
404	0	TSLexeme *ptr = norms;
405
406	0	prs->pos++; /* set pos */
407
408	0	while (ptr->lexeme)
409	0	{
410	0	if (prs->curwords == prs->lenwords)
411	0	{
412	0	prs->lenwords *= 2;
413	0	prs->words = (ParsedWord ) repalloc(prs->words, prs->lenwords sizeof(ParsedWord));
414	0	}
415
416	0	if (ptr->flags & TSL_ADDPOS)
417	0	prs->pos++;
418	0	prs->words[prs->curwords].len = strlen(ptr->lexeme);
419	0	prs->words[prs->curwords].word = ptr->lexeme;
420	0	prs->words[prs->curwords].nvariant = ptr->nvariant;
421	0	prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
422	0	prs->words[prs->curwords].alen = 0;
423	0	prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
424	0	ptr++;
425	0	prs->curwords++;
426	0	}
427	0	pfree(norms);
428	0	}
429	0	} while (type > 0);
430
431	0	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
432	0	}
433
434		/*
435		* Headline framework
436		*/
437
438		/* Add a word to prs->words[] */
439		static void
440		hladdword(HeadlineParsedText prs, char buf, int buflen, int type)
441	0	{
442	0	if (prs->curwords >= prs->lenwords)
443	0	{
444	0	prs->lenwords *= 2;
445	0	prs->words = (HeadlineWordEntry ) repalloc(prs->words, prs->lenwords sizeof(HeadlineWordEntry));
446	0	}
447	0	memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
448	0	prs->words[prs->curwords].type = (uint8) type;
449	0	prs->words[prs->curwords].len = buflen;
450	0	prs->words[prs->curwords].word = palloc(buflen);
451	0	memcpy(prs->words[prs->curwords].word, buf, buflen);
452	0	prs->curwords++;
453	0	}
454
455		/*
456		* Add pos and matching-query-item data to the just-added word.
457		* Here, buf/buflen represent a processed lexeme, not raw token text.
458		*
459		* If the query contains more than one matching item, we replicate
460		* the last-added word so that each item can be pointed to. The
461		* duplicate entries are marked with repeated = 1.
462		*/
463		static void
464		hlfinditem(HeadlineParsedText prs, TSQuery query, int32 pos, char buf, int buflen)
465	0	{
466	0	int i;
467	0	QueryItem *item = GETQUERY(query);
468	0	HeadlineWordEntry *word;
469
470	0	while (prs->curwords + query->size >= prs->lenwords)
471	0	{
472	0	prs->lenwords *= 2;
473	0	prs->words = (HeadlineWordEntry ) repalloc(prs->words, prs->lenwords sizeof(HeadlineWordEntry));
474	0	}
475
476	0	word = &(prs->words[prs->curwords - 1]);
477	0	word->pos = LIMITPOS(pos);
478	0	for (i = 0; i < query->size; i++)
479	0	{
480	0	if (item->type == QI_VAL &&
481	0	tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
482	0	buf, buflen, item->qoperand.prefix) == 0)
483	0	{
484	0	if (word->item)
485	0	{
486	0	memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
487	0	prs->words[prs->curwords].item = &item->qoperand;
488	0	prs->words[prs->curwords].repeated = 1;
489	0	prs->curwords++;
490	0	}
491	0	else
492	0	word->item = &item->qoperand;
493	0	}
494	0	item++;
495	0	}
496	0	}
497
498		static void
499		addHLParsedLex(HeadlineParsedText prs, TSQuery query, ParsedLex lexs, TSLexeme *norms)
500	0	{
501	0	ParsedLex *tmplexs;
502	0	TSLexeme *ptr;
503	0	int32 savedpos;
504
505	0	while (lexs)
506	0	{
507	0	if (lexs->type > 0)
508	0	hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
509
510	0	ptr = norms;
511	0	savedpos = prs->vectorpos;
512	0	while (ptr && ptr->lexeme)
513	0	{
514	0	if (ptr->flags & TSL_ADDPOS)
515	0	savedpos++;
516	0	hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
517	0	ptr++;
518	0	}
519
520	0	tmplexs = lexs->next;
521	0	pfree(lexs);
522	0	lexs = tmplexs;
523	0	}
524
525	0	if (norms)
526	0	{
527	0	ptr = norms;
528	0	while (ptr->lexeme)
529	0	{
530	0	if (ptr->flags & TSL_ADDPOS)
531	0	prs->vectorpos++;
532	0	pfree(ptr->lexeme);
533	0	ptr++;
534	0	}
535	0	pfree(norms);
536	0	}
537	0	}
538
539		void
540		hlparsetext(Oid cfgId, HeadlineParsedText prs, TSQuery query, char buf, int buflen)
541	0	{
542	0	int type,
543	0	lenlemm = 0; /* silence compiler warning */
544	0	char *lemm = NULL;
545	0	LexizeData ldata;
546	0	TSLexeme *norms;
547	0	ParsedLex *lexs;
548	0	TSConfigCacheEntry *cfg;
549	0	TSParserCacheEntry *prsobj;
550	0	void *prsdata;
551
552	0	cfg = lookup_ts_config_cache(cfgId);
553	0	prsobj = lookup_ts_parser_cache(cfg->prsId);
554
555	0	prsdata = DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
556	0	PointerGetDatum(buf),
557	0	Int32GetDatum(buflen)));
558
559	0	LexizeInit(&ldata, cfg);
560
561	0	do
562	0	{
563	0	type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
564	0	PointerGetDatum(prsdata),
565	0	PointerGetDatum(&lemm),
566	0	PointerGetDatum(&lenlemm)));
567
568	0	if (type > 0 && lenlemm >= MAXSTRLEN)
569	0	{
570	0	#ifdef IGNORE_LONGLEXEME
571	0	ereport(NOTICE,
572	0	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
573	0	errmsg("word is too long to be indexed"),
574	0	errdetail("Words longer than %d characters are ignored.",
575	0	MAXSTRLEN)));
576	0	continue;
577		#else
578		ereport(ERROR,
579		(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
580		errmsg("word is too long to be indexed"),
581		errdetail("Words longer than %d characters are ignored.",
582		MAXSTRLEN)));
583		#endif
584	0	}
585
586	0	LexizeAddLemm(&ldata, type, lemm, lenlemm);
587
588	0	do
589	0	{
590	0	if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
591	0	{
592	0	prs->vectorpos++;
593	0	addHLParsedLex(prs, query, lexs, norms);
594	0	}
595	0	else
596	0	addHLParsedLex(prs, query, lexs, NULL);
597	0	} while (norms);
598	0	} while (type > 0);
599
600	0	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
601	0	}
602
603		/*
604		* Generate the headline, as a text object, from HeadlineParsedText.
605		*/
606		text *
607		generateHeadline(HeadlineParsedText *prs)
608	0	{
609	0	text *out;
610	0	char *ptr;
611	0	int len = 128;
612	0	int numfragments = 0;
613	0	int16 infrag = 0;
614
615	0	HeadlineWordEntry *wrd = prs->words;
616
617	0	out = (text *) palloc(len);
618	0	ptr = ((char *) out) + VARHDRSZ;
619
620	0	while (wrd - prs->words < prs->curwords)
621	0	{
622	0	while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
623	0	{
624	0	int dist = ptr - ((char *) out);
625
626	0	len *= 2;
627	0	out = (text *) repalloc(out, len);
628	0	ptr = ((char *) out) + dist;
629	0	}
630
631	0	if (wrd->in && !wrd->repeated)
632	0	{
633	0	if (!infrag)
634	0	{
635
636		/* start of a new fragment */
637	0	infrag = 1;
638	0	numfragments++;
639		/* add a fragment delimiter if this is after the first one */
640	0	if (numfragments > 1)
641	0	{
642	0	memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
643	0	ptr += prs->fragdelimlen;
644	0	}
645	0	}
646	0	if (wrd->replace)
647	0	{
648	0	*ptr = ' ';
649	0	ptr++;
650	0	}
651	0	else if (!wrd->skip)
652	0	{
653	0	if (wrd->selected)
654	0	{
655	0	memcpy(ptr, prs->startsel, prs->startsellen);
656	0	ptr += prs->startsellen;
657	0	}
658	0	memcpy(ptr, wrd->word, wrd->len);
659	0	ptr += wrd->len;
660	0	if (wrd->selected)
661	0	{
662	0	memcpy(ptr, prs->stopsel, prs->stopsellen);
663	0	ptr += prs->stopsellen;
664	0	}
665	0	}
666	0	}
667	0	else if (!wrd->repeated)
668	0	{
669	0	if (infrag)
670	0	infrag = 0;
671	0	pfree(wrd->word);
672	0	}
673
674	0	wrd++;
675	0	}
676
677	0	SET_VARSIZE(out, ptr - ((char *) out));
678	0	return out;
679	0	}