/src/postgres/src/backend/tsearch/to_tsany.c

Source
/*-------------------------------------------------------------------------
 *
 * to_tsany.c
 *    to_ts* function definitions
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/backend/tsearch/to_tsany.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/jsonfuncs.h"


/*
 * Opaque data structure, which is passed by parse_tsquery() to pushval_morph().
 *
 * The *_to_tsquery_byid() functions in this file fill one of these on their
 * stack and hand its address to parse_tsquery() wrapped in a Datum.
 */
typedef struct MorphOpaque
{
  /* Text search configuration that pushval_morph() passes to parsetext(). */
  Oid     cfg_id;

  /*
   * Single tsquery morph could be parsed into multiple words.  When these
   * words reside in adjacent positions, they are connected using this
   * operator.  Usually, that is OP_PHRASE, which requires word positions of
   * a complex morph to exactly match the tsvector.  plainto_tsquery_byid()
   * is the one caller in this file that uses OP_AND instead.
   */
  int     qoperator;
} MorphOpaque;

/*
 * State handed to add_to_tsvector() through the void pointer argument of
 * iterate_json_values() / iterate_jsonb_values().
 */
typedef struct TSVectorBuildState
{
  /* Accumulator for parsed words; shared by all elements of one document. */
  ParsedText *prs;
  /* Text search configuration passed to parsetext() for every element. */
  Oid     cfgId;
} TSVectorBuildState;

/* Forward declaration: callback given to iterate_json(b)_values() below. */
static void add_to_tsvector(void *_state, char *elem_value, int elem_len);


Datum
get_current_ts_config(PG_FUNCTION_ARGS)
{
  PG_RETURN_OID(getTSCurrentConfig(true));
}

/*
 * to_tsvector
 */
static int
compareWORD(const void *a, const void *b)
{
  int     res;

  res = tsCompareString(((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
              ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
              false);

  if (res == 0)
  {
    if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
      return 0;

    res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
  }

  return res;
}

/*
 * Sort the l entries of a[] and merge entries with identical word text,
 * compacting the survivors to the front of a[] in place.
 *
 * On return every surviving entry owns a palloc'd pos.apos array in which
 * apos[0] is the number of stored positions and apos[1..apos[0]] are the
 * positions; alen is the allocated length of that array in uint16 elements.
 * The word strings of merged (duplicate) entries are pfree'd.
 *
 * Returns the number of surviving entries.  The caller (make_tsvector) only
 * calls this with l > 0.
 *
 * NOTE(review): pos.pos and pos.apos presumably share storage (a union in
 * ParsedWord, declared in tsearch/ts_utils.h) -- confirm.  The code below is
 * consistent with that: it always reads pos.pos into a temporary before
 * assigning pos.apos of the same entry.
 */
static int
uniqueWORD(ParsedWord *a, int32 l)
{
  ParsedWord *ptr,
         *res;
  int     tmppos;

  /* A single word needs no sorting or merging, just a position array. */
  if (l == 1)
  {
    tmppos = LIMITPOS(a->pos.pos);
    a->alen = 2;
    a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
    a->pos.apos[0] = 1;
    a->pos.apos[1] = tmppos;
    return l;
  }

  /* res: last entry of the compacted output; ptr: next input entry */
  res = a;
  ptr = a + 1;

  /*
   * Sort words with its positions
   */
  qsort(a, l, sizeof(ParsedWord), compareWORD);

  /*
   * Initialize first word and its first position
   */
  tmppos = LIMITPOS(a->pos.pos);
  a->alen = 2;
  a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
  a->pos.apos[0] = 1;
  a->pos.apos[1] = tmppos;

  /*
   * Summarize position information for each word
   */
  while (ptr - a < l)
  {
    if (!(ptr->len == res->len &&
        strncmp(ptr->word, res->word, res->len) == 0))
    {
      /*
       * Got a new word, so put it in result.  res may be the same entry
       * as ptr here (when nothing has been merged yet), so the position
       * is fetched before pos.apos is assigned.
       */
      res++;
      res->len = ptr->len;
      res->word = ptr->word;
      tmppos = LIMITPOS(ptr->pos.pos);
      res->alen = 2;
      res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
      res->pos.apos[0] = 1;
      res->pos.apos[1] = tmppos;
    }
    else
    {
      /*
       * The word already exists, so adjust position information. But
       * before we should check size of position's array, max allowed
       * value for position and uniqueness of position
       */
      pfree(ptr->word);
      if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
        res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
      {
        /* Double the array when the next slot would not fit. */
        if (res->pos.apos[0] + 1 >= res->alen)
        {
          res->alen *= 2;
          res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
        }
        /*
         * NOTE(review): this re-tests the uniqueness condition that the
         * enclosing if already established, and apos[0] is never set to
         * 0 in this function, so the test looks always-true here.
         */
        if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
        {
          res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
          res->pos.apos[0]++;
        }
      }
    }
    ptr++;
  }

  /* res points at the last surviving entry, so the count is one more. */
  return res + 1 - a;
}

/*
 * Build a TSVector from the words accumulated in *prs.
 *
 * Duplicate words are merged first (uniqueWORD), then the total size of the
 * lexeme/position area is computed, and finally each word's text and its
 * position list are copied into a freshly palloc0'd value.
 *
 * Raises ERRCODE_PROGRAM_LIMIT_EXCEEDED when the lexeme/position area would
 * exceed MAXSTRPOS bytes.
 *
 * Note: frees prs->words and subsidiary data (word strings, position arrays).
 */
TSVector
make_tsvector(ParsedText *prs)
{
  TSVector  result;
  WordEntry  *entry;
  char     *data;
  int     nwords;
  int     datalen = 0;
  int     offset = 0;
  int     vecsize;
  int     w;

  /* Merge duplicate words */
  if (prs->curwords > 0)
    prs->curwords = uniqueWORD(prs->words, prs->curwords);
  nwords = prs->curwords;

  /* First pass: how many bytes do the lexemes and position lists need? */
  for (w = 0; w < nwords; w++)
  {
    ParsedWord *pw = &prs->words[w];

    datalen += pw->len;
    if (!pw->alen)
      continue;

    /* position data starts on a short-aligned offset */
    datalen = SHORTALIGN(datalen);
    datalen += sizeof(uint16) + pw->pos.apos[0] * sizeof(WordEntryPos);
  }

  if (datalen > MAXSTRPOS)
    ereport(ERROR,
        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
         errmsg("string is too long for tsvector (%d bytes, max %d bytes)", datalen, MAXSTRPOS)));

  vecsize = CALCDATASIZE(nwords, datalen);
  result = (TSVector) palloc0(vecsize);
  SET_VARSIZE(result, vecsize);
  result->size = nwords;

  /* Second pass: fill in the entries and copy the data, freeing as we go. */
  entry = ARRPTR(result);
  data = STRPTR(result);
  for (w = 0; w < nwords; w++, entry++)
  {
    ParsedWord *pw = &prs->words[w];

    entry->len = pw->len;
    entry->pos = offset;
    memcpy(data + offset, pw->word, pw->len);
    offset += pw->len;
    pfree(pw->word);

    if (!pw->alen)
    {
      entry->haspos = 0;
      continue;
    }

    {
      int     npos = pw->pos.apos[0];
      WordEntryPos *posdata;
      int     p;

      if (npos > 0xFFFF)
        elog(ERROR, "positions array too long");

      entry->haspos = 1;
      offset = SHORTALIGN(offset);
      /* the position list is prefixed by its length as a uint16 */
      *(uint16 *) (data + offset) = (uint16) npos;
      posdata = POSDATAPTR(result, entry);
      for (p = 0; p < npos; p++)
      {
        WEP_SETWEIGHT(posdata[p], 0);
        WEP_SETPOS(posdata[p], pw->pos.apos[p + 1]);
      }
      offset += sizeof(uint16) + npos * sizeof(WordEntryPos);
      pfree(pw->pos.apos);
    }
  }

  if (prs->words)
    pfree(prs->words);

  return result;
}

/*
 * fmgr entry point: parse the text in argument 1 with the text search
 * configuration whose OID is argument 0, and return the resulting tsvector.
 */
Datum
to_tsvector_byid(PG_FUNCTION_ARGS)
{
  Oid     cfgId = PG_GETARG_OID(0);
  text     *in = PG_GETARG_TEXT_PP(1);
  ParsedText  parsed;
  TSVector  vector;

  parsed.curwords = 0;
  parsed.pos = 0;

  /*
   * Initial size of the words array is only a guess (one word per six
   * bytes of input), bounded below by 2 and above by what a single
   * allocation of MaxAllocSize bytes can hold.
   */
  parsed.lenwords = VARSIZE_ANY_EXHDR(in) / 6;
  if (parsed.lenwords < 2)
    parsed.lenwords = 2;
  else if (parsed.lenwords > MaxAllocSize / sizeof(ParsedWord))
    parsed.lenwords = MaxAllocSize / sizeof(ParsedWord);
  parsed.words = (ParsedWord *) palloc(sizeof(ParsedWord) * parsed.lenwords);

  parsetext(cfgId, &parsed, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));

  /* the input text is not needed once it has been parsed */
  PG_FREE_IF_COPY(in, 1);

  vector = make_tsvector(&parsed);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * fmgr entry point: like to_tsvector_byid(), but using the configuration
 * returned by getTSCurrentConfig(true).  The text is argument 0.
 */
Datum
to_tsvector(PG_FUNCTION_ARGS)
{
  text     *document = PG_GETARG_TEXT_PP(0);
  Oid     config = getTSCurrentConfig(true);
  Datum   vector;

  vector = DirectFunctionCall2(to_tsvector_byid,
                 ObjectIdGetDatum(config),
                 PointerGetDatum(document));

  PG_RETURN_DATUM(vector);
}

/*
 * Shared implementation of jsonb(_string)_to_tsvector(_byid).
 *
 * Runs add_to_tsvector() over the values of jb selected by flags (via
 * iterate_jsonb_values) and turns the collected words into a tsvector.
 */
static TSVector
jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
{
  ParsedText  parsed;
  TSVectorBuildState build;

  /* A NULL words array makes add_to_tsvector() allocate it on first use. */
  parsed.words = NULL;
  parsed.curwords = 0;

  build.cfgId = cfgId;
  build.prs = &parsed;

  iterate_jsonb_values(jb, flags, &build, add_to_tsvector);

  return make_tsvector(&parsed);
}

/*
 * fmgr entry point: tsvector built from a jsonb document (argument 1) with
 * the jtiString flag, using the configuration OID in argument 0.
 */
Datum
jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
{
  Oid     config = PG_GETARG_OID(0);
  Jsonb    *doc = PG_GETARG_JSONB_P(1);
  TSVector  vector = jsonb_to_tsvector_worker(config, doc, jtiString);

  PG_FREE_IF_COPY(doc, 1);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * fmgr entry point: tsvector built from a jsonb document (argument 0) with
 * the jtiString flag, using the configuration from getTSCurrentConfig(true).
 */
Datum
jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
{
  Jsonb    *doc = PG_GETARG_JSONB_P(0);
  Oid     config = getTSCurrentConfig(true);
  TSVector  vector = jsonb_to_tsvector_worker(config, doc, jtiString);

  PG_FREE_IF_COPY(doc, 0);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * fmgr entry point: tsvector built from a jsonb document (argument 1).  The
 * jsonb in argument 2 is decoded by parse_jsonb_index_flags() into the flags
 * that select which values are processed; argument 0 is the configuration
 * OID.
 */
Datum
jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
{
  Oid     config = PG_GETARG_OID(0);
  Jsonb    *doc = PG_GETARG_JSONB_P(1);
  Jsonb    *flagdoc = PG_GETARG_JSONB_P(2);
  uint32    selected = parse_jsonb_index_flags(flagdoc);
  TSVector  vector;

  vector = jsonb_to_tsvector_worker(config, doc, selected);

  PG_FREE_IF_COPY(doc, 1);
  PG_FREE_IF_COPY(flagdoc, 2);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * fmgr entry point: tsvector built from a jsonb document (argument 0) with
 * flags decoded from the jsonb in argument 1, using the configuration from
 * getTSCurrentConfig(true).
 */
Datum
jsonb_to_tsvector(PG_FUNCTION_ARGS)
{
  Jsonb    *doc = PG_GETARG_JSONB_P(0);
  Jsonb    *flagdoc = PG_GETARG_JSONB_P(1);
  uint32    selected = parse_jsonb_index_flags(flagdoc);
  Oid     config = getTSCurrentConfig(true);
  TSVector  vector;

  vector = jsonb_to_tsvector_worker(config, doc, selected);

  PG_FREE_IF_COPY(doc, 0);
  PG_FREE_IF_COPY(flagdoc, 1);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * Shared implementation of json(_string)_to_tsvector(_byid).
 *
 * Runs add_to_tsvector() over the values of the json text selected by flags
 * (via iterate_json_values) and turns the collected words into a tsvector.
 */
static TSVector
json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
{
  ParsedText  parsed;
  TSVectorBuildState build;

  /* A NULL words array makes add_to_tsvector() allocate it on first use. */
  parsed.words = NULL;
  parsed.curwords = 0;

  build.cfgId = cfgId;
  build.prs = &parsed;

  iterate_json_values(json, flags, &build, add_to_tsvector);

  return make_tsvector(&parsed);
}

/*
 * fmgr entry point: tsvector built from a json text (argument 1) with the
 * jtiString flag, using the configuration OID in argument 0.
 */
Datum
json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
{
  Oid     config = PG_GETARG_OID(0);
  text     *doc = PG_GETARG_TEXT_P(1);
  TSVector  vector = json_to_tsvector_worker(config, doc, jtiString);

  PG_FREE_IF_COPY(doc, 1);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * fmgr entry point: tsvector built from a json text (argument 0) with the
 * jtiString flag, using the configuration from getTSCurrentConfig(true).
 */
Datum
json_string_to_tsvector(PG_FUNCTION_ARGS)
{
  text     *doc = PG_GETARG_TEXT_P(0);
  Oid     config = getTSCurrentConfig(true);
  TSVector  vector = json_to_tsvector_worker(config, doc, jtiString);

  PG_FREE_IF_COPY(doc, 0);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * fmgr entry point: tsvector built from a json text (argument 1).  The jsonb
 * in argument 2 is decoded by parse_jsonb_index_flags() into the flags that
 * select which values are processed; argument 0 is the configuration OID.
 */
Datum
json_to_tsvector_byid(PG_FUNCTION_ARGS)
{
  Oid     config = PG_GETARG_OID(0);
  text     *doc = PG_GETARG_TEXT_P(1);
  Jsonb    *flagdoc = PG_GETARG_JSONB_P(2);
  uint32    selected = parse_jsonb_index_flags(flagdoc);
  TSVector  vector;

  vector = json_to_tsvector_worker(config, doc, selected);

  PG_FREE_IF_COPY(doc, 1);
  PG_FREE_IF_COPY(flagdoc, 2);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * fmgr entry point: tsvector built from a json text (argument 0) with flags
 * decoded from the jsonb in argument 1, using the configuration from
 * getTSCurrentConfig(true).
 */
Datum
json_to_tsvector(PG_FUNCTION_ARGS)
{
  text     *doc = PG_GETARG_TEXT_P(0);
  Jsonb    *flagdoc = PG_GETARG_JSONB_P(1);
  uint32    selected = parse_jsonb_index_flags(flagdoc);
  Oid     config = getTSCurrentConfig(true);
  TSVector  vector;

  vector = json_to_tsvector_worker(config, doc, selected);

  PG_FREE_IF_COPY(doc, 0);
  PG_FREE_IF_COPY(flagdoc, 1);

  PG_RETURN_TSVECTOR(vector);
}

/*
 * Callback for iterate_json(b)_values(): parse one element of a json(b)
 * value and append its lexemes to the ParsedText held in the
 * TSVectorBuildState that _state points to.
 */
static void
add_to_tsvector(void *_state, char *elem_value, int elem_len)
{
  TSVectorBuildState *build = (TSVectorBuildState *) _state;
  ParsedText *parsed = build->prs;
  int32   words_before;

  /*
   * Lazily set up the accumulator on the first element.  The starting
   * capacity is just a reasonable guess; parsetext() enlarges the array
   * when it needs more room.
   */
  if (parsed->words == NULL)
  {
    parsed->curwords = 0;
    parsed->pos = 0;
    parsed->lenwords = 16;
    parsed->words = (ParsedWord *) palloc(sizeof(ParsedWord) * parsed->lenwords);
  }

  words_before = parsed->curwords;

  parsetext(build->cfgId, parsed, elem_value, elem_len);

  /*
   * When this element contributed at least one word, leave a one-position
   * gap after it.  Otherwise a phrase search could treat the last word of
   * this element and the first word of the next one as adjacent.
   */
  if (parsed->curwords > words_before)
    parsed->pos += 1;
}


/*
 * to_tsquery
 */


/*
 * This function is used for morph parsing; it is the callback that the
 * *_to_tsquery_byid() functions pass to parse_tsquery().
 *
 * The value is passed to parsetext which will call the right dictionary to
 * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
 * to the stack.
 *
 * All words belonging to the same variant are pushed as an ANDed list,
 * and different variants are ORed together.  Results for successive
 * positions are connected with the operator stored in the MorphOpaque
 * (data->qoperator).
 *
 *  opaque          pointer to the caller's MorphOpaque, wrapped in a Datum
 *  state           parser state handed on to pushValue/pushOperator/pushStop
 *  strval, lenval  the text to lexize
 *  weight          passed through to pushValue() for every lexeme
 *  prefix          if true, every lexeme is pushed as a prefix match; a
 *                  lexeme flagged TSL_PREFIX is pushed that way regardless
 *
 * The temporary words array is released on every path, including the one
 * where parsetext() produced no words at all.
 */
static void
pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
{
  int32   count = 0;
  ParsedText  prs;
  uint32    variant,
        pos = 0,
        cntvar = 0,
        cntpos = 0,
        cnt = 0;
  MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);

  prs.lenwords = 4;
  prs.curwords = 0;
  prs.pos = 0;
  prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);

  parsetext(data->cfg_id, &prs, strval, lenval);

  if (prs.curwords <= 0)
  {
    /* Nothing survived lexizing: the whole value is a stop word. */
    pushStop(state);
    pfree(prs.words);
    return;
  }

  while (count < prs.curwords)
  {
    /*
     * Were any stop words removed? If so, fill empty positions with
     * placeholders linked by an appropriate operator.  Gaps before the
     * first word are not filled, because pos is still 0 at that point.
     */
    if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
    {
      while (pos + 1 < prs.words[count].pos.pos)
      {
        /* put placeholders for each missing stop word */
        pushStop(state);
        if (cntpos)
          pushOperator(state, data->qoperator, 1);
        cntpos++;
        pos++;
      }
    }

    /* save current word's position */
    pos = prs.words[count].pos.pos;

    /* Go through all variants obtained from this token */
    cntvar = 0;
    while (count < prs.curwords && pos == prs.words[count].pos.pos)
    {
      variant = prs.words[count].nvariant;

      /* Push all words belonging to the same variant */
      cnt = 0;
      while (count < prs.curwords &&
           pos == prs.words[count].pos.pos &&
           variant == prs.words[count].nvariant)
      {
        pushValue(state,
              prs.words[count].word,
              prs.words[count].len,
              weight,
              ((prs.words[count].flags & TSL_PREFIX) || prefix));
        pfree(prs.words[count].word);
        /* operators follow their operands, so AND comes after the 2nd word */
        if (cnt)
          pushOperator(state, OP_AND, 0);
        cnt++;
        count++;
      }

      if (cntvar)
        pushOperator(state, OP_OR, 0);
      cntvar++;
    }

    if (cntpos)
    {
      /* distance may be useful */
      pushOperator(state, data->qoperator, 1);
    }

    cntpos++;
  }

  pfree(prs.words);
}

/*
 * fmgr entry point: parse the text in argument 1 as a tsquery, lexizing each
 * morph with the configuration whose OID is argument 0.
 */
Datum
to_tsquery_byid(PG_FUNCTION_ARGS)
{
  text     *querytext = PG_GETARG_TEXT_PP(1);
  MorphOpaque morph;
  TSQuery   parsed;

  morph.cfg_id = PG_GETARG_OID(0);

  /*
   * With OP_PHRASE as the connecting operator, the words that a complex
   * morph is split into must appear in the tsvector at exactly matching
   * positions.  Complex morphs that are themselves joined by OP_PHRASE get
   * all of their words chained into one OP_PHRASE sequence.
   */
  morph.qoperator = OP_PHRASE;

  parsed = parse_tsquery(text_to_cstring(querytext),
               pushval_morph,
               PointerGetDatum(&morph),
               0,
               NULL);

  PG_RETURN_TSQUERY(parsed);
}

/*
 * fmgr entry point: like to_tsquery_byid(), but using the configuration
 * returned by getTSCurrentConfig(true).  The query text is argument 0.
 */
Datum
to_tsquery(PG_FUNCTION_ARGS)
{
  text     *querytext = PG_GETARG_TEXT_PP(0);
  Oid     config = getTSCurrentConfig(true);
  Datum   parsed;

  parsed = DirectFunctionCall2(to_tsquery_byid,
                 ObjectIdGetDatum(config),
                 PointerGetDatum(querytext));

  PG_RETURN_DATUM(parsed);
}

/*
 * fmgr entry point: turn the plain text in argument 1 into a tsquery whose
 * words are all ANDed together, lexizing with the configuration whose OID is
 * argument 0.
 */
Datum
plainto_tsquery_byid(PG_FUNCTION_ARGS)
{
  text     *in = PG_GETARG_TEXT_PP(1);
  TSQuery   query;
  MorphOpaque data;

  data.cfg_id = PG_GETARG_OID(0);

  /*
   * parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
   * single morph.  Passing OP_AND as a qoperator makes tsquery require
   * matching of all words independently of their positions.
   */
  data.qoperator = OP_AND;

  query = parse_tsquery(text_to_cstring(in),
              pushval_morph,
              PointerGetDatum(&data),
              P_TSQ_PLAIN,
              NULL);

  /* Return through the tsquery macro, like the sibling *_byid functions. */
  PG_RETURN_TSQUERY(query);
}

/*
 * fmgr entry point: like plainto_tsquery_byid(), but using the configuration
 * returned by getTSCurrentConfig(true).  The text is argument 0.
 */
Datum
plainto_tsquery(PG_FUNCTION_ARGS)
{
  text     *plaintext = PG_GETARG_TEXT_PP(0);
  Oid     config = getTSCurrentConfig(true);
  Datum   parsed;

  parsed = DirectFunctionCall2(plainto_tsquery_byid,
                 ObjectIdGetDatum(config),
                 PointerGetDatum(plaintext));

  PG_RETURN_DATUM(parsed);
}


/*
 * fmgr entry point: turn the plain text in argument 1 into a phrase tsquery,
 * lexizing with the configuration whose OID is argument 0.
 */
Datum
phraseto_tsquery_byid(PG_FUNCTION_ARGS)
{
  text     *phrasetext = PG_GETARG_TEXT_PP(1);
  MorphOpaque morph;
  TSQuery   parsed;

  morph.cfg_id = PG_GETARG_OID(0);

  /*
   * Under P_TSQ_PLAIN, parse_tsquery() treats the entire input as one
   * morph.  Connecting its words with OP_PHRASE makes the resulting
   * tsquery require the word positions to match.
   */
  morph.qoperator = OP_PHRASE;

  parsed = parse_tsquery(text_to_cstring(phrasetext),
               pushval_morph,
               PointerGetDatum(&morph),
               P_TSQ_PLAIN,
               NULL);

  PG_RETURN_TSQUERY(parsed);
}

/*
 * fmgr entry point: like phraseto_tsquery_byid(), but using the
 * configuration returned by getTSCurrentConfig(true).  The text is
 * argument 0.
 */
Datum
phraseto_tsquery(PG_FUNCTION_ARGS)
{
  text     *phrasetext = PG_GETARG_TEXT_PP(0);
  Oid     config = getTSCurrentConfig(true);
  Datum   parsed;

  parsed = DirectFunctionCall2(phraseto_tsquery_byid,
                 ObjectIdGetDatum(config),
                 PointerGetDatum(phrasetext));

  PG_RETURN_DATUM(parsed);
}

/*
 * fmgr entry point: parse the text in argument 1 with the P_TSQ_WEB flag of
 * parse_tsquery(), lexizing with the configuration whose OID is argument 0.
 */
Datum
websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
{
  text     *searchtext = PG_GETARG_TEXT_PP(1);
  MorphOpaque morph;
  TSQuery   parsed;

  morph.cfg_id = PG_GETARG_OID(0);

  /*
   * With OP_PHRASE as the connecting operator, the words that a complex
   * morph is split into must appear in the tsvector at exactly matching
   * positions.  Complex morphs given in quotes get all of their words
   * chained into one OP_PHRASE sequence.
   */
  morph.qoperator = OP_PHRASE;

  parsed = parse_tsquery(text_to_cstring(searchtext),
               pushval_morph,
               PointerGetDatum(&morph),
               P_TSQ_WEB,
               NULL);

  PG_RETURN_TSQUERY(parsed);
}

/*
 * fmgr entry point: like websearch_to_tsquery_byid(), but using the
 * configuration returned by getTSCurrentConfig(true).  The text is
 * argument 0.
 */
Datum
websearch_to_tsquery(PG_FUNCTION_ARGS)
{
  text     *searchtext = PG_GETARG_TEXT_PP(0);
  Oid     config = getTSCurrentConfig(true);
  Datum   parsed;

  parsed = DirectFunctionCall2(websearch_to_tsquery_byid,
                 ObjectIdGetDatum(config),
                 PointerGetDatum(searchtext));

  PG_RETURN_DATUM(parsed);
}

Coverage Report

Created: 2025-09-27 06:52

Line	Count	Source
1		/*-------------------------------------------------------------------------
2		*
3		* to_tsany.c
4		* to_ts* function definitions
5		*
6		* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7		*
8		*
9		* IDENTIFICATION
10		* src/backend/tsearch/to_tsany.c
11		*
12		*-------------------------------------------------------------------------
13		*/
14		#include "postgres.h"
15
16		#include "tsearch/ts_cache.h"
17		#include "tsearch/ts_utils.h"
18		#include "utils/builtins.h"
19		#include "utils/jsonfuncs.h"
20
21
22		/*
23		* Opaque data structure, which is passed by parse_tsquery() to pushval_morph().
24		*/
25		typedef struct MorphOpaque
26		{
27		Oid cfg_id;
28
29		/*
30		* Single tsquery morph could be parsed into multiple words. When these
31		* words reside in adjacent positions, they are connected using this
32		* operator. Usually, that is OP_PHRASE, which requires word positions of
33		* a complex morph to exactly match the tsvector.
34		*/
35		int qoperator;
36		} MorphOpaque;
37
38		typedef struct TSVectorBuildState
39		{
40		ParsedText *prs;
41		Oid cfgId;
42		} TSVectorBuildState;
43
44		static void add_to_tsvector(void *_state, char *elem_value, int elem_len);
45
46
47		Datum
48		get_current_ts_config(PG_FUNCTION_ARGS)
49	0	{
50	0	PG_RETURN_OID(getTSCurrentConfig(true));
51	0	}
52
53		/*
54		* to_tsvector
55		*/
56		static int
57		compareWORD(const void *a, const void *b)
58	0	{
59	0	int res;
60
61	0	res = tsCompareString(((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
62	0	((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
63	0	false);
64
65	0	if (res == 0)
66	0	{
67	0	if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
68	0	return 0;
69
70	0	res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
71	0	}
72
73	0	return res;
74	0	}
75
76		static int
77		uniqueWORD(ParsedWord *a, int32 l)
78	0	{
79	0	ParsedWord *ptr,
80	0	*res;
81	0	int tmppos;
82
83	0	if (l == 1)
84	0	{
85	0	tmppos = LIMITPOS(a->pos.pos);
86	0	a->alen = 2;
87	0	a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
88	0	a->pos.apos[0] = 1;
89	0	a->pos.apos[1] = tmppos;
90	0	return l;
91	0	}
92
93	0	res = a;
94	0	ptr = a + 1;
95
96		/*
97		* Sort words with its positions
98		*/
99	0	qsort(a, l, sizeof(ParsedWord), compareWORD);
100
101		/*
102		* Initialize first word and its first position
103		*/
104	0	tmppos = LIMITPOS(a->pos.pos);
105	0	a->alen = 2;
106	0	a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
107	0	a->pos.apos[0] = 1;
108	0	a->pos.apos[1] = tmppos;
109
110		/*
111		* Summarize position information for each word
112		*/
113	0	while (ptr - a < l)
114	0	{
115	0	if (!(ptr->len == res->len &&
116	0	strncmp(ptr->word, res->word, res->len) == 0))
117	0	{
118		/*
119		* Got a new word, so put it in result
120		*/
121	0	res++;
122	0	res->len = ptr->len;
123	0	res->word = ptr->word;
124	0	tmppos = LIMITPOS(ptr->pos.pos);
125	0	res->alen = 2;
126	0	res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
127	0	res->pos.apos[0] = 1;
128	0	res->pos.apos[1] = tmppos;
129	0	}
130	0	else
131	0	{
132		/*
133		* The word already exists, so adjust position information. But
134		* before we should check size of position's array, max allowed
135		* value for position and uniqueness of position
136		*/
137	0	pfree(ptr->word);
138	0	if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
139	0	res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
140	0	{
141	0	if (res->pos.apos[0] + 1 >= res->alen)
142	0	{
143	0	res->alen *= 2;
144	0	res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
145	0	}
146	0	if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
147	0	{
148	0	res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
149	0	res->pos.apos[0]++;
150	0	}
151	0	}
152	0	}
153	0	ptr++;
154	0	}
155
156	0	return res + 1 - a;
157	0	}
158
159		/*
160		* make value of tsvector, given parsed text
161		*
162		* Note: frees prs->words and subsidiary data.
163		*/
164		TSVector
165		make_tsvector(ParsedText *prs)
166	0	{
167	0	int i,
168	0	j,
169	0	lenstr = 0,
170	0	totallen;
171	0	TSVector in;
172	0	WordEntry *ptr;
173	0	char *str;
174	0	int stroff;
175
176		/* Merge duplicate words */
177	0	if (prs->curwords > 0)
178	0	prs->curwords = uniqueWORD(prs->words, prs->curwords);
179
180		/* Determine space needed */
181	0	for (i = 0; i < prs->curwords; i++)
182	0	{
183	0	lenstr += prs->words[i].len;
184	0	if (prs->words[i].alen)
185	0	{
186	0	lenstr = SHORTALIGN(lenstr);
187	0	lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
188	0	}
189	0	}
190
191	0	if (lenstr > MAXSTRPOS)
192	0	ereport(ERROR,
193	0	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
194	0	errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
195
196	0	totallen = CALCDATASIZE(prs->curwords, lenstr);
197	0	in = (TSVector) palloc0(totallen);
198	0	SET_VARSIZE(in, totallen);
199	0	in->size = prs->curwords;
200
201	0	ptr = ARRPTR(in);
202	0	str = STRPTR(in);
203	0	stroff = 0;
204	0	for (i = 0; i < prs->curwords; i++)
205	0	{
206	0	ptr->len = prs->words[i].len;
207	0	ptr->pos = stroff;
208	0	memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
209	0	stroff += prs->words[i].len;
210	0	pfree(prs->words[i].word);
211	0	if (prs->words[i].alen)
212	0	{
213	0	int k = prs->words[i].pos.apos[0];
214	0	WordEntryPos *wptr;
215
216	0	if (k > 0xFFFF)
217	0	elog(ERROR, "positions array too long");
218
219	0	ptr->haspos = 1;
220	0	stroff = SHORTALIGN(stroff);
221	0	*(uint16 *) (str + stroff) = (uint16) k;
222	0	wptr = POSDATAPTR(in, ptr);
223	0	for (j = 0; j < k; j++)
224	0	{
225	0	WEP_SETWEIGHT(wptr[j], 0);
226	0	WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
227	0	}
228	0	stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
229	0	pfree(prs->words[i].pos.apos);
230	0	}
231	0	else
232	0	ptr->haspos = 0;
233	0	ptr++;
234	0	}
235
236	0	if (prs->words)
237	0	pfree(prs->words);
238
239	0	return in;
240	0	}
241
242		Datum
243		to_tsvector_byid(PG_FUNCTION_ARGS)
244	0	{
245	0	Oid cfgId = PG_GETARG_OID(0);
246	0	text *in = PG_GETARG_TEXT_PP(1);
247	0	ParsedText prs;
248	0	TSVector out;
249
250	0	prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's
251		* number */
252	0	if (prs.lenwords < 2)
253	0	prs.lenwords = 2;
254	0	else if (prs.lenwords > MaxAllocSize / sizeof(ParsedWord))
255	0	prs.lenwords = MaxAllocSize / sizeof(ParsedWord);
256	0	prs.curwords = 0;
257	0	prs.pos = 0;
258	0	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
259
260	0	parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
261
262	0	PG_FREE_IF_COPY(in, 1);
263
264	0	out = make_tsvector(&prs);
265
266	0	PG_RETURN_TSVECTOR(out);
267	0	}
268
269		Datum
270		to_tsvector(PG_FUNCTION_ARGS)
271	0	{
272	0	text *in = PG_GETARG_TEXT_PP(0);
273	0	Oid cfgId;
274
275	0	cfgId = getTSCurrentConfig(true);
276	0	PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
277	0	ObjectIdGetDatum(cfgId),
278	0	PointerGetDatum(in)));
279	0	}
280
281		/*
282		* Worker function for jsonb(_string)_to_tsvector(_byid)
283		*/
284		static TSVector
285		jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
286	0	{
287	0	TSVectorBuildState state;
288	0	ParsedText prs;
289
290	0	prs.words = NULL;
291	0	prs.curwords = 0;
292	0	state.prs = &prs;
293	0	state.cfgId = cfgId;
294
295	0	iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
296
297	0	return make_tsvector(&prs);
298	0	}
299
300		Datum
301		jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
302	0	{
303	0	Oid cfgId = PG_GETARG_OID(0);
304	0	Jsonb *jb = PG_GETARG_JSONB_P(1);
305	0	TSVector result;
306
307	0	result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
308	0	PG_FREE_IF_COPY(jb, 1);
309
310	0	PG_RETURN_TSVECTOR(result);
311	0	}
312
313		Datum
314		jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
315	0	{
316	0	Jsonb *jb = PG_GETARG_JSONB_P(0);
317	0	Oid cfgId;
318	0	TSVector result;
319
320	0	cfgId = getTSCurrentConfig(true);
321	0	result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
322	0	PG_FREE_IF_COPY(jb, 0);
323
324	0	PG_RETURN_TSVECTOR(result);
325	0	}
326
327		Datum
328		jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
329	0	{
330	0	Oid cfgId = PG_GETARG_OID(0);
331	0	Jsonb *jb = PG_GETARG_JSONB_P(1);
332	0	Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
333	0	TSVector result;
334	0	uint32 flags = parse_jsonb_index_flags(jbFlags);
335
336	0	result = jsonb_to_tsvector_worker(cfgId, jb, flags);
337	0	PG_FREE_IF_COPY(jb, 1);
338	0	PG_FREE_IF_COPY(jbFlags, 2);
339
340	0	PG_RETURN_TSVECTOR(result);
341	0	}
342
343		Datum
344		jsonb_to_tsvector(PG_FUNCTION_ARGS)
345	0	{
346	0	Jsonb *jb = PG_GETARG_JSONB_P(0);
347	0	Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
348	0	Oid cfgId;
349	0	TSVector result;
350	0	uint32 flags = parse_jsonb_index_flags(jbFlags);
351
352	0	cfgId = getTSCurrentConfig(true);
353	0	result = jsonb_to_tsvector_worker(cfgId, jb, flags);
354	0	PG_FREE_IF_COPY(jb, 0);
355	0	PG_FREE_IF_COPY(jbFlags, 1);
356
357	0	PG_RETURN_TSVECTOR(result);
358	0	}
359
360		/*
361		* Worker function for json(_string)_to_tsvector(_byid)
362		*/
363		static TSVector
364		json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
365	0	{
366	0	TSVectorBuildState state;
367	0	ParsedText prs;
368
369	0	prs.words = NULL;
370	0	prs.curwords = 0;
371	0	state.prs = &prs;
372	0	state.cfgId = cfgId;
373
374	0	iterate_json_values(json, flags, &state, add_to_tsvector);
375
376	0	return make_tsvector(&prs);
377	0	}
378
379		Datum
380		json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
381	0	{
382	0	Oid cfgId = PG_GETARG_OID(0);
383	0	text *json = PG_GETARG_TEXT_P(1);
384	0	TSVector result;
385
386	0	result = json_to_tsvector_worker(cfgId, json, jtiString);
387	0	PG_FREE_IF_COPY(json, 1);
388
389	0	PG_RETURN_TSVECTOR(result);
390	0	}
391
392		Datum
393		json_string_to_tsvector(PG_FUNCTION_ARGS)
394	0	{
395	0	text *json = PG_GETARG_TEXT_P(0);
396	0	Oid cfgId;
397	0	TSVector result;
398
399	0	cfgId = getTSCurrentConfig(true);
400	0	result = json_to_tsvector_worker(cfgId, json, jtiString);
401	0	PG_FREE_IF_COPY(json, 0);
402
403	0	PG_RETURN_TSVECTOR(result);
404	0	}
405
406		Datum
407		json_to_tsvector_byid(PG_FUNCTION_ARGS)
408	0	{
409	0	Oid cfgId = PG_GETARG_OID(0);
410	0	text *json = PG_GETARG_TEXT_P(1);
411	0	Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
412	0	TSVector result;
413	0	uint32 flags = parse_jsonb_index_flags(jbFlags);
414
415	0	result = json_to_tsvector_worker(cfgId, json, flags);
416	0	PG_FREE_IF_COPY(json, 1);
417	0	PG_FREE_IF_COPY(jbFlags, 2);
418
419	0	PG_RETURN_TSVECTOR(result);
420	0	}
421
422		Datum
423		json_to_tsvector(PG_FUNCTION_ARGS)
424	0	{
425	0	text *json = PG_GETARG_TEXT_P(0);
426	0	Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
427	0	Oid cfgId;
428	0	TSVector result;
429	0	uint32 flags = parse_jsonb_index_flags(jbFlags);
430
431	0	cfgId = getTSCurrentConfig(true);
432	0	result = json_to_tsvector_worker(cfgId, json, flags);
433	0	PG_FREE_IF_COPY(json, 0);
434	0	PG_FREE_IF_COPY(jbFlags, 1);
435
436	0	PG_RETURN_TSVECTOR(result);
437	0	}
438
439		/*
440		* Parse lexemes in an element of a json(b) value, add to TSVectorBuildState.
441		*/
442		static void
443		add_to_tsvector(void *_state, char *elem_value, int elem_len)
444	0	{
445	0	TSVectorBuildState *state = (TSVectorBuildState *) _state;
446	0	ParsedText *prs = state->prs;
447	0	int32 prevwords;
448
449	0	if (prs->words == NULL)
450	0	{
451		/*
452		* First time through: initialize words array to a reasonable size.
453		* (parsetext() will realloc it bigger as needed.)
454		*/
455	0	prs->lenwords = 16;
456	0	prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
457	0	prs->curwords = 0;
458	0	prs->pos = 0;
459	0	}
460
461	0	prevwords = prs->curwords;
462
463	0	parsetext(state->cfgId, prs, elem_value, elem_len);
464
465		/*
466		* If we extracted any words from this JSON element, advance pos to create
467		* an artificial break between elements. This is because we don't want
468		* phrase searches to think that the last word in this element is adjacent
469		* to the first word in the next one.
470		*/
471	0	if (prs->curwords > prevwords)
472	0	prs->pos += 1;
473	0	}
474
475
476		/*
477		* to_tsquery
478		*/
479
480
481		/*
482		* This function is used for morph parsing.
483		*
484		* The value is passed to parsetext which will call the right dictionary to
485		* lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
486		* to the stack.
487		*
488		* All words belonging to the same variant are pushed as an ANDed list,
489		* and different variants are ORed together.
490		*/
491		static void
492		pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
493	0	{
494	0	int32 count = 0;
495	0	ParsedText prs;
496	0	uint32 variant,
497	0	pos = 0,
498	0	cntvar = 0,
499	0	cntpos = 0,
500	0	cnt = 0;
501	0	MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
502
503	0	prs.lenwords = 4;
504	0	prs.curwords = 0;
505	0	prs.pos = 0;
506	0	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
507
508	0	parsetext(data->cfg_id, &prs, strval, lenval);
509
510	0	if (prs.curwords > 0)
511	0	{
512	0	while (count < prs.curwords)
513	0	{
514		/*
515		* Were any stop words removed? If so, fill empty positions with
516		* placeholders linked by an appropriate operator.
517		*/
518	0	if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
519	0	{
520	0	while (pos + 1 < prs.words[count].pos.pos)
521	0	{
522		/* put placeholders for each missing stop word */
523	0	pushStop(state);
524	0	if (cntpos)
525	0	pushOperator(state, data->qoperator, 1);
526	0	cntpos++;
527	0	pos++;
528	0	}
529	0	}
530
531		/* save current word's position */
532	0	pos = prs.words[count].pos.pos;
533
534		/* Go through all variants obtained from this token */
535	0	cntvar = 0;
536	0	while (count < prs.curwords && pos == prs.words[count].pos.pos)
537	0	{
538	0	variant = prs.words[count].nvariant;
539
540		/* Push all words belonging to the same variant */
541	0	cnt = 0;
542	0	while (count < prs.curwords &&
543	0	pos == prs.words[count].pos.pos &&
544	0	variant == prs.words[count].nvariant)
545	0	{
546	0	pushValue(state,
547	0	prs.words[count].word,
548	0	prs.words[count].len,
549	0	weight,
550	0	((prs.words[count].flags & TSL_PREFIX) || prefix));
551	0	pfree(prs.words[count].word);
552	0	if (cnt)
553	0	pushOperator(state, OP_AND, 0);
554	0	cnt++;
555	0	count++;
556	0	}
557
558	0	if (cntvar)
559	0	pushOperator(state, OP_OR, 0);
560	0	cntvar++;
561	0	}
562
563	0	if (cntpos)
564	0	{
565		/* distance may be useful */
566	0	pushOperator(state, data->qoperator, 1);
567	0	}
568
569	0	cntpos++;
570	0	}
571
572	0	pfree(prs.words);
573	0	}
574	0	else
575	0	pushStop(state);
576	0	}
577
578		Datum
579		to_tsquery_byid(PG_FUNCTION_ARGS)
580	0	{
581	0	text *in = PG_GETARG_TEXT_PP(1);
582	0	TSQuery query;
583	0	MorphOpaque data;
584
585	0	data.cfg_id = PG_GETARG_OID(0);
586
587		/*
588		* Passing OP_PHRASE as a qoperator makes tsquery require matching of word
589		* positions of a complex morph exactly match the tsvector. Also, when
590		* the complex morphs are connected with OP_PHRASE operator, we connect
591		* all their words into the OP_PHRASE sequence.
592		*/
593	0	data.qoperator = OP_PHRASE;
594
595	0	query = parse_tsquery(text_to_cstring(in),
596	0	pushval_morph,
597	0	PointerGetDatum(&data),
598	0	0,
599	0	NULL);
600
601	0	PG_RETURN_TSQUERY(query);
602	0	}
603
604		Datum
605		to_tsquery(PG_FUNCTION_ARGS)
606	0	{
607	0	text *in = PG_GETARG_TEXT_PP(0);
608	0	Oid cfgId;
609
610	0	cfgId = getTSCurrentConfig(true);
611	0	PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
612	0	ObjectIdGetDatum(cfgId),
613	0	PointerGetDatum(in)));
614	0	}
615
616		Datum
617		plainto_tsquery_byid(PG_FUNCTION_ARGS)
618	0	{
619	0	text *in = PG_GETARG_TEXT_PP(1);
620	0	TSQuery query;
621	0	MorphOpaque data;
622
623	0	data.cfg_id = PG_GETARG_OID(0);
624
625		/*
626		* parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
627		* single morph. Passing OP_PHRASE as a qoperator makes tsquery require
628		* matching of all words independently on their positions.
629		*/
630	0	data.qoperator = OP_AND;
631
632	0	query = parse_tsquery(text_to_cstring(in),
633	0	pushval_morph,
634	0	PointerGetDatum(&data),
635	0	P_TSQ_PLAIN,
636	0	NULL);
637
638	0	PG_RETURN_POINTER(query);
639	0	}
640
641		Datum
642		plainto_tsquery(PG_FUNCTION_ARGS)
643	0	{
644	0	text *in = PG_GETARG_TEXT_PP(0);
645	0	Oid cfgId;
646
647	0	cfgId = getTSCurrentConfig(true);
648	0	PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
649	0	ObjectIdGetDatum(cfgId),
650	0	PointerGetDatum(in)));
651	0	}
652
653
654		Datum
655		phraseto_tsquery_byid(PG_FUNCTION_ARGS)
656	0	{
657	0	text *in = PG_GETARG_TEXT_PP(1);
658	0	TSQuery query;
659	0	MorphOpaque data;
660
661	0	data.cfg_id = PG_GETARG_OID(0);
662
663		/*
664		* parse_tsquery() with P_TSQ_PLAIN flag takes the whole input text as a
665		* single morph. Passing OP_PHRASE as a qoperator makes tsquery require
666		* matching of word positions.
667		*/
668	0	data.qoperator = OP_PHRASE;
669
670	0	query = parse_tsquery(text_to_cstring(in),
671	0	pushval_morph,
672	0	PointerGetDatum(&data),
673	0	P_TSQ_PLAIN,
674	0	NULL);
675
676	0	PG_RETURN_TSQUERY(query);
677	0	}
678
679		Datum
680		phraseto_tsquery(PG_FUNCTION_ARGS)
681	0	{
682	0	text *in = PG_GETARG_TEXT_PP(0);
683	0	Oid cfgId;
684
685	0	cfgId = getTSCurrentConfig(true);
686	0	PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
687	0	ObjectIdGetDatum(cfgId),
688	0	PointerGetDatum(in)));
689	0	}
690
691		Datum
692		websearch_to_tsquery_byid(PG_FUNCTION_ARGS)
693	0	{
694	0	text *in = PG_GETARG_TEXT_PP(1);
695	0	MorphOpaque data;
696	0	TSQuery query = NULL;
697
698	0	data.cfg_id = PG_GETARG_OID(0);
699
700		/*
701		* Passing OP_PHRASE as a qoperator makes tsquery require matching of word
702		* positions of a complex morph exactly match the tsvector. Also, when
703		* the complex morphs are given in quotes, we connect all their words into
704		* the OP_PHRASE sequence.
705		*/
706	0	data.qoperator = OP_PHRASE;
707
708	0	query = parse_tsquery(text_to_cstring(in),
709	0	pushval_morph,
710	0	PointerGetDatum(&data),
711	0	P_TSQ_WEB,
712	0	NULL);
713
714	0	PG_RETURN_TSQUERY(query);
715	0	}
716
717		Datum
718		websearch_to_tsquery(PG_FUNCTION_ARGS)
719	0	{
720	0	text *in = PG_GETARG_TEXT_PP(0);
721	0	Oid cfgId;
722
723	0	cfgId = getTSCurrentConfig(true);
724	0	PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid,
725	0	ObjectIdGetDatum(cfgId),
726	0	PointerGetDatum(in)));
727	0	}