/src/ntp-dev/sntp/libopts/tokenize.c

Source
/** \file tokenize.c
 *
 *  Tokenize a string, accommodating quoted strings.
 *
 * @addtogroup autoopts
 * @{
 */
/*
 *  This file defines the string_tokenize interface
 *  This file is part of AutoOpts, a companion to AutoGen.
 *  AutoOpts is free software.
 *  AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved
 *
 *  AutoOpts is available under any one of two licenses.  The license
 *  in use must be one of these two and the choice is under the control
 *  of the user of the license.
 *
 *   The GNU Lesser General Public License, version 3 or later
 *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
 *
 *   The Modified Berkeley Software Distribution License
 *      See the file "COPYING.mbsd"
 *
 *  These files have the following sha256 sums:
 *
 *  8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95  COPYING.gplv3
 *  4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b  COPYING.lgplv3
 *  13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239  COPYING.mbsd
 */

/*
 *  Copy the body of a double quoted string.
 *
 *  On entry, "*ppSrc" addresses the opening quote character and "*ppDest"
 *  addresses the next free byte of the output buffer.  Characters are
 *  copied up to the closing double quote.  A backslash hands the text
 *  that follows it to ao_string_cook_escape_char(), which yields the
 *  replacement character and the number of source bytes to skip.
 *
 *  On success, "*ppDest" is advanced past the last byte stored and
 *  "*ppSrc" addresses the character after the closing quote.  If the
 *  string ends before a closing quote is seen, "*ppSrc" is set to NULL
 *  and "*ppDest" is left untouched.
 */
static void
copy_cooked(ch_t ** ppDest, char const ** ppSrc)
{
    ch_t *       out = (ch_t *)*ppDest;
    const ch_t * in  = (const ch_t *)(*ppSrc + 1); /* skip opening quote */

    for (;;) {
        ch_t ch = *(in++);

        if (ch == NUL) {
            /* unterminated quoted string */
            *ppSrc = NULL;
            return;
        }

        if (ch == '"')
            break;

        if (ch == '\\') {
            in += ao_string_cook_escape_char((char *)in, (char *)&ch, 0x7F);

            /*
             *  A result of 0x7F means there is nothing to store for this
             *  escape.  NOTE(review): presumably 0x7F is the "no output"
             *  marker requested via the third argument -- confirm against
             *  ao_string_cook_escape_char().
             */
            if (ch == 0x7F)
                continue;
        }

        *(out++) = ch;
    }

    *ppDest = (ch_t *)out;        /* next spot for storing character */
    *ppSrc  = (char const *)in;   /* char following closing quote    */
}


/*
 *  Copy the body of a single quoted string.
 *
 *  On entry, "*ppSrc" addresses the opening apostrophe and "*ppDest"
 *  addresses the next free byte of the output buffer.  Text is copied
 *  verbatim up to the closing apostrophe, except that a backslash is
 *  special in front of four things:
 *
 *    carriage return   both are dropped, along with a newline that
 *                      immediately follows the carriage return
 *    newline           both are dropped
 *    apostrophe        a single apostrophe is stored
 *    backslash         a single backslash is stored
 *
 *  A backslash in front of anything else is stored as is and the
 *  following character is then handled normally.
 *
 *  On success, "*ppDest" is advanced past the last byte stored and
 *  "*ppSrc" addresses the character after the closing apostrophe.  If the
 *  string ends before the closing apostrophe, "*ppSrc" is set to NULL and
 *  "*ppDest" is left untouched.
 */
static void
copy_raw(ch_t ** ppDest, char const ** ppSrc)
{
    ch_t * out = *ppDest;
    cc_t * in  = (cc_t *) (*ppSrc + 1); /* skip opening apostrophe */

    for (;;) {
        ch_t ch = *(in++);

        if (ch == NUL) {
            /* unterminated quoted string */
            *ppSrc = NULL;
            return;
        }

        if (ch == '\'')
            break;

        if (ch == '\\') {
            ch_t nxt = *in;

            if (nxt == NUL) {
                /* backslash was the very last character */
                *ppSrc = NULL;
                return;
            }

            if (nxt == '\r') {
                /* drop "\\\r" and also a newline right behind it */
                if (*(++in) == NL)
                    ++in;
                continue;
            }

            if (nxt == NL) {
                /* drop the escaped newline */
                ++in;
                continue;
            }

            if (nxt == '\'') {
                /* escaped apostrophe: store the apostrophe only */
                ch = '\'';
                ++in;

            } else if (nxt == '\\') {
                /* doubled backslash: store one, consume both */
                ++in;
            }
        }

        *(out++) = ch;
    }

    *ppDest = out;                  /* next spot for storing character */
    *ppSrc  = (char const *) in;    /* char following closing quote    */
}

/*
 *  Allocate the one memory block that holds both the token pointer array
 *  and the token text for "str".
 *
 *  Leading white space is skipped first.  Returns NULL with errno set to
 *  ENOENT when "str" is NULL or contains nothing but white space, and
 *  NULL with errno set to ENOMEM when malloc() fails.  Otherwise the
 *  result has "tkn_list[0]" pointing at the first byte that follows the
 *  pointer array, which is where the token text is to be written.
 *  No other field of the result is initialized here.
 */
static token_list_t *
alloc_token_list(char const * str)
{
    token_list_t * tl;
    char const *   scan;

    /* allow for trailing NULL pointer & NUL on string */
    int slot_ct = 2;

    if (str == NULL) {
        errno = ENOENT;
        return NULL;
    }

    str = SPN_WHITESPACE_CHARS(str);
    if (*str == NUL) {
        errno = ENOENT;
        return NULL;
    }

    /*
     *  Count the white space separated words.  Without quoted strings the
     *  count is exact.  With quoted strings it may come out too high, which
     *  only costs the space for some unused pointers.
     */
    scan = str;
    do {
        slot_ct++;
        scan = BRK_WHITESPACE_CHARS(scan+1);
        scan = SPN_WHITESPACE_CHARS(scan);
    } while (*scan != NUL);

    /* "scan" now sits on the terminating NUL, so scan - str is the length */
    tl = malloc(sizeof(*tl) + (size_t)(scan - str)
                + ((size_t)slot_ct * sizeof(ch_t *)));

    if (tl == NULL) {
        errno = ENOMEM;
        return NULL;
    }

    /* token text starts right behind the last pointer slot */
    tl->tkn_list[0] = (ch_t *)(tl->tkn_list + (slot_ct - 1));
    return tl;
}

/*=export_func ao_string_tokenize
 *
 * what: tokenize an input string
 *
 * arg:  + char const * + string + string to be tokenized +
 *
 * ret_type:  token_list_t *
 * ret_desc:  pointer to a structure that lists each token
 *
 * doc:
 *
 * This function will convert one input string into a list of strings.
 * The list of strings is derived by separating the input based on
 * white space separation.  However, if the input contains either single
 * or double quote characters, then the text after that character up to
 * a matching quote will become the string in the list.
 *
 *  The returned pointer should be deallocated with @code{free(3C)} when
 *  you are done using the data.  The data are placed in a single block of
 *  allocated memory.  Do not deallocate individual token/strings.
 *
 *  The structure pointed to will contain at least these two fields:
 *  @table @samp
 *  @item tkn_ct
 *  The number of tokens found in the input string.
 *  @item tkn_list
 *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
 *  the last pointer set to NULL.
 *  @end table
 *
 * There are two types of quoted strings: single quoted (@code{'}) and
 * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
 * escape characters (@code{\\}) are simply another character, except when
 * preceding the following characters:
 * @example
 * @code{\\}  double backslashes reduce to one
 * @code{'}   incorporates the single quote into the string
 * @code{\n}  suppresses both the backslash and newline character
 * @end example
 *
 * Double quote strings are formed according to the rules of string
 * constants in ANSI-C programs.
 *
 * example:
 * @example
 *    #include <stdlib.h>
 *    int ix;
 *    token_list_t * ptl = ao_string_tokenize(some_string);
 *    for (ix = 0; ix < ptl->tkn_ct; ix++)
 *       do_something_with_tkn(ptl->tkn_list[ix]);
 *    free(ptl);
 * @end example
 * Note that everything is freed with the one call to @code{free(3C)}.
 *
 * err:
 *  NULL is returned and @code{errno} will be set to indicate the problem:
 *  @itemize @bullet
 *  @item
 *  @code{EINVAL} - There was an unterminated quoted string.
 *  @item
 *  @code{ENOENT} - The input string was empty.
 *  @item
 *  @code{ENOMEM} - There is not enough memory.
 *  @end itemize
=*/
token_list_t *
ao_string_tokenize(char const * str)
{
    token_list_t * res = alloc_token_list(str);
    ch_t * pzDest;

    /*
     *  A NULL result means the input was NULL or empty, or the allocation
     *  failed.  alloc_token_list() has already set errno.
     */
    if (res == NULL)
        return res;

    /*
     *  alloc_token_list() sized the pointer array from the text that
     *  follows any leading white space, but it advanced only its own copy
     *  of the pointer.  Skip that white space here, too.  Without this,
     *  the first pass through the loop below produces a spurious empty
     *  token, "tkn_ct" ends up one larger than the array was sized for and
     *  the terminating NULL pointer gets stored on top of the token text.
     */
    str = SPN_WHITESPACE_CHARS(str);

    /*
     *  Now copy each token into the output buffer.  The text area starts
     *  where alloc_token_list() left "tkn_list[0]" pointing.
     */
    pzDest = (ch_t *)(res->tkn_list[0]);
    res->tkn_ct  = 0;

    do  {
        /* every pass of this loop produces exactly one token */
        res->tkn_list[ res->tkn_ct++ ] = pzDest;
        for (;;) {
            int ch = (ch_t)*str;
            if (IS_WHITESPACE_CHAR(ch)) {
            found_white_space:
                /* token ends here; move on to the start of the next one */
                str = SPN_WHITESPACE_CHARS(str+1);
                break;
            }

            switch (ch) {
            case '"':
                /* the helper sets "str" to NULL on a missing close quote */
                copy_cooked(&pzDest, &str);
                if (str == NULL) {
                    free(res);
                    errno = EINVAL; /* set after free() so it survives */
                    return NULL;
                }
                if (IS_WHITESPACE_CHAR(*str))
                    goto found_white_space;
                break; /* more text is glued onto the same token */

            case '\'':
                /* the helper sets "str" to NULL on a missing close quote */
                copy_raw(&pzDest, &str);
                if (str == NULL) {
                    free(res);
                    errno = EINVAL; /* set after free() so it survives */
                    return NULL;
                }
                if (IS_WHITESPACE_CHAR(*str))
                    goto found_white_space;
                break; /* more text is glued onto the same token */

            case NUL:
                goto copy_done;

            default:
                str++;
                *(pzDest++) = (unsigned char)ch;
            }
        } copy_done:;

        /*
         * NUL terminate the last token and see if we have any more tokens.
         */
        *(pzDest++) = NUL;
    } while (*str != NUL);

    /* the pointer array is NULL terminated for the caller's convenience */
    res->tkn_list[ res->tkn_ct ] = NULL;

    return res;
}

#ifdef TEST
#include <stdio.h>
#include <string.h>

/*
 *  Stand-alone test driver.  Every command line argument is tokenized on
 *  its own and the resulting tokens (or the errno from the failure) are
 *  printed to stdout.  With no arguments, a usage line is printed and
 *  the exit code is 1.
 */
int
main(int argc, char ** argv)
{
    int arg_ix;

    if (argc == 1) {
        printf("USAGE:  %s arg [ ... ]\n", argv[0]);
        return 1;
    }

    for (arg_ix = 1; arg_ix < argc; arg_ix++) {
        char *         arg = argv[arg_ix];
        token_list_t * tl  = ao_string_tokenize(arg);
        int            tkn_ix;

        if (tl == NULL) {
            printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
                   arg, errno, strerror(errno));
            continue;
        }

        printf("Parsed string ``%s''\ninto %d tokens:\n", arg, tl->tkn_ct);

        /* a successful parse always yields at least one token */
        tkn_ix = 0;
        do {
            printf(" %3d:  ``%s''\n", tkn_ix+1, tl->tkn_list[tkn_ix]);
            tkn_ix++;
        } while (tkn_ix < tl->tkn_ct);

        /* one free() releases the pointer array and all token text */
        free(tl);
    }
    return 0;
}
#endif

/** @}
 *
 * Local Variables:
 * mode: C
 * c-file-style: "stroustrup"
 * indent-tabs-mode: nil
 * End:
 * end of autoopts/tokenize.c */

Coverage Report

Created: 2026-02-26 06:20

Line	Count	Source
1		/** \file tokenize.c
2		*
3		* Tokenize a string, accommodating quoted strings.
4		*
5		* @addtogroup autoopts
6		* @{
7		*/
8		/*
9		* This file defines the string_tokenize interface
10		* This file is part of AutoOpts, a companion to AutoGen.
11		* AutoOpts is free software.
12		* AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved
13		*
14		* AutoOpts is available under any one of two licenses. The license
15		* in use must be one of these two and the choice is under the control
16		* of the user of the license.
17		*
18		* The GNU Lesser General Public License, version 3 or later
19		* See the files "COPYING.lgplv3" and "COPYING.gplv3"
20		*
21		* The Modified Berkeley Software Distribution License
22		* See the file "COPYING.mbsd"
23		*
24		* These files have the following sha256 sums:
25		*
26		* 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3
27		* 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3
28		* 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd
29		*/
30
31		static void
32		copy_cooked(ch_t ** ppDest, char const ** ppSrc)
33	0	{
34	0	ch_t * pDest = (ch_t *)*ppDest;
35	0	const ch_t * pSrc = (const ch_t *)(*ppSrc + 1);
36
37	0	for (;;) {
38	0	ch_t ch = *(pSrc++);
39	0	switch (ch) {
40	0	case NUL: *ppSrc = NULL; return;
41	0	case '"': goto done;
42	0	case '\\':
43	0	pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F);
44	0	if (ch == 0x7F)
45	0	break;
46		/* FALLTHROUGH */
47
48	0	default:
49	0	*(pDest++) = ch;
50	0	}
51	0	}
52
53	0	done:
54	0	*ppDest = (ch_t *)pDest; /* next spot for storing character */
55	0	*ppSrc = (char const *)pSrc; /* char following closing quote */
56	0	}
57
58
59		static void
60		copy_raw(ch_t ** ppDest, char const ** ppSrc)
61	0	{
62	0	ch_t * pDest = *ppDest;
63	0	cc_t * pSrc = (cc_t *) (*ppSrc + 1);
64
65	0	for (;;) {
66	0	ch_t ch = *(pSrc++);
67	0	switch (ch) {
68	0	case NUL: *ppSrc = NULL; return;
69	0	case '\'': goto done;
70	0	case '\\':
71		/*
72		* Four escapes are handled: newline removal, escape char
73		* quoting and apostrophe quoting
74		*/
75	0	switch (*pSrc) {
76	0	case NUL: *ppSrc = NULL; return;
77	0	case '\r':
78	0	if (*(++pSrc) == NL)
79	0	++pSrc;
80	0	continue;
81
82	0	case NL:
83	0	++pSrc;
84	0	continue;
85
86	0	case '\'':
87	0	ch = '\'';
88		/* FALLTHROUGH */
89
90	0	case '\\':
91	0	++pSrc;
92	0	break;
93	0	}
94		/* FALLTHROUGH */
95
96	0	default:
97	0	*(pDest++) = ch;
98	0	}
99	0	}
100
101	0	done:
102	0	*ppDest = pDest; /* next spot for storing character */
103	0	*ppSrc = (char const *) pSrc; /* char following closing quote */
104	0	}
105
106		static token_list_t *
107		alloc_token_list(char const * str)
108	0	{
109	0	token_list_t * res;
110
111	0	int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
112
113	0	if (str == NULL) goto enoent_res;
114
115		/*
116		* Trim leading white space. Use "ENOENT" and a NULL return to indicate
117		* an empty string was passed.
118		*/
119	0	str = SPN_WHITESPACE_CHARS(str);
120	0	if (*str == NUL) goto enoent_res;
121
122		/*
123		* Take an approximate count of tokens. If no quoted strings are used,
124		* it will be accurate. If quoted strings are used, it will be a little
125		* high and we'll squander the space for a few extra pointers.
126		*/
127	0	{
128	0	char const * pz = str;
129
130	0	do {
131	0	max_token_ct++;
132	0	pz = BRK_WHITESPACE_CHARS(pz+1);
133	0	pz = SPN_WHITESPACE_CHARS(pz);
134	0	} while (*pz != NUL);
135
136	0	res = malloc(sizeof(*res) + (size_t)(pz - str)
137	0	+ ((size_t)max_token_ct * sizeof(ch_t *)));
138	0	}
139
140	0	if (res == NULL)
141	0	errno = ENOMEM;
142	0	else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1));
143
144	0	return res;
145
146	0	enoent_res:
147
148	0	errno = ENOENT;
149	0	return NULL;
150	0	}
151
152		/*=export_func ao_string_tokenize
153		*
154		* what: tokenize an input string
155		*
156		* arg: + char const * + string + string to be tokenized +
157		*
158		* ret_type: token_list_t *
159		* ret_desc: pointer to a structure that lists each token
160		*
161		* doc:
162		*
163		* This function will convert one input string into a list of strings.
164		* The list of strings is derived by separating the input based on
165		* white space separation. However, if the input contains either single
166		* or double quote characters, then the text after that character up to
167		* a matching quote will become the string in the list.
168		*
169		* The returned pointer should be deallocated with @code{free(3C)} when
170		* are done using the data. The data are placed in a single block of
171		* allocated memory. Do not deallocate individual token/strings.
172		*
173		* The structure pointed to will contain at least these two fields:
174		* @table @samp
175		* @item tkn_ct
176		* The number of tokens found in the input string.
177		* @item tok_list
178		* An array of @code{tkn_ct + 1} pointers to substring tokens, with
179		* the last pointer set to NULL.
180		* @end table
181		*
182		* There are two types of quoted strings: single quoted (@code{'}) and
183		* double quoted (@code{"}). Singly quoted strings are fairly raw in that
184		* escape characters (@code{\\}) are simply another character, except when
185		* preceding the following characters:
186		* @example
187		* @code{\\} double backslashes reduce to one
188		* @code{'} incorporates the single quote into the string
189		* @code{\n} suppresses both the backslash and newline character
190		* @end example
191		*
192		* Double quote strings are formed according to the rules of string
193		* constants in ANSI-C programs.
194		*
195		* example:
196		* @example
197		* #include <stdlib.h>
198		* int ix;
199		* token_list_t * ptl = ao_string_tokenize(some_string)
200		* for (ix = 0; ix < ptl->tkn_ct; ix++)
201		* do_something_with_tkn(ptl->tkn_list[ix]);
202		* free(ptl);
203		* @end example
204		* Note that everything is freed with the one call to @code{free(3C)}.
205		*
206		* err:
207		* NULL is returned and @code{errno} will be set to indicate the problem:
208		* @itemize @bullet
209		* @item
210		* @code{EINVAL} - There was an unterminated quoted string.
211		* @item
212		* @code{ENOENT} - The input string was empty.
213		* @item
214		* @code{ENOMEM} - There is not enough memory.
215		* @end itemize
216		=*/
217		token_list_t *
218		ao_string_tokenize(char const * str)
219	0	{
220	0	token_list_t * res = alloc_token_list(str);
221	0	ch_t * pzDest;
222
223		/*
224		* Now copy each token into the output buffer.
225		*/
226	0	if (res == NULL)
227	0	return res;
228
229	0	pzDest = (ch_t *)(res->tkn_list[0]);
230	0	res->tkn_ct = 0;
231
232	0	do {
233	0	res->tkn_list[ res->tkn_ct++ ] = pzDest;
234	0	for (;;) {
235	0	int ch = (ch_t)*str;
236	0	if (IS_WHITESPACE_CHAR(ch)) {
237	0	found_white_space:
238	0	str = SPN_WHITESPACE_CHARS(str+1);
239	0	break;
240	0	}
241
242	0	switch (ch) {
243	0	case '"':
244	0	copy_cooked(&pzDest, &str);
245	0	if (str == NULL) {
246	0	free(res);
247	0	errno = EINVAL;
248	0	return NULL;
249	0	}
250	0	if (IS_WHITESPACE_CHAR(*str))
251	0	goto found_white_space;
252	0	break;
253
254	0	case '\'':
255	0	copy_raw(&pzDest, &str);
256	0	if (str == NULL) {
257	0	free(res);
258	0	errno = EINVAL;
259	0	return NULL;
260	0	}
261	0	if (IS_WHITESPACE_CHAR(*str))
262	0	goto found_white_space;
263	0	break;
264
265	0	case NUL:
266	0	goto copy_done;
267
268	0	default:
269	0	str++;
270	0	*(pzDest++) = (unsigned char)ch;
271	0	}
272	0	} copy_done:;
273
274		/*
275		* NUL terminate the last token and see if we have any more tokens.
276		*/
277	0	*(pzDest++) = NUL;
278	0	} while (*str != NUL);
279
280	0	res->tkn_list[ res->tkn_ct ] = NULL;
281
282	0	return res;
283	0	}
284
285		#ifdef TEST
286		#include <stdio.h>
287		#include <string.h>
288
289		int
290		main(int argc, char ** argv)
291		{
292		if (argc == 1) {
293		printf("USAGE: %s arg [ ... ]\n", *argv);
294		return 1;
295		}
296		while (--argc > 0) {
297		char * arg = *(++argv);
298		token_list_t * p = ao_string_tokenize(arg);
299		if (p == NULL) {
300		printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
301		arg, errno, strerror(errno));
302		} else {
303		int ix = 0;
304		printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
305		do {
306		printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]);
307		} while (++ix < p->tkn_ct);
308		free(p);
309		}
310		}
311		return 0;
312		}
313		#endif
314
315		/** @}
316		*
317		* Local Variables:
318		* mode: C
319		* c-file-style: "stroustrup"
320		* indent-tabs-mode: nil
321		* End:
322		* end of autoopts/tokenize.c */