/src/ntp-dev/sntp/libopts/tokenize.c
Line | Count | Source |
1 | | /** \file tokenize.c |
2 | | * |
3 | | * Tokenize a string, accommodating quoted strings. |
4 | | * |
5 | | * @addtogroup autoopts |
6 | | * @{ |
7 | | */ |
8 | | /* |
9 | | * This file defines the string_tokenize interface |
10 | | * This file is part of AutoOpts, a companion to AutoGen. |
11 | | * AutoOpts is free software. |
12 | | * AutoOpts is Copyright (C) 1992-2018 by Bruce Korb - all rights reserved |
13 | | * |
14 | | * AutoOpts is available under any one of two licenses. The license |
15 | | * in use must be one of these two and the choice is under the control |
16 | | * of the user of the license. |
17 | | * |
18 | | * The GNU Lesser General Public License, version 3 or later |
19 | | * See the files "COPYING.lgplv3" and "COPYING.gplv3" |
20 | | * |
21 | | * The Modified Berkeley Software Distribution License |
22 | | * See the file "COPYING.mbsd" |
23 | | * |
24 | | * These files have the following sha256 sums: |
25 | | * |
26 | | * 8584710e9b04216a394078dc156b781d0b47e1729104d666658aecef8ee32e95 COPYING.gplv3 |
27 | | * 4379e7444a0e2ce2b12dd6f5a52a27a4d02d39d247901d3285c88cf0d37f477b COPYING.lgplv3 |
28 | | * 13aa749a5b0a454917a944ed8fffc530b784f5ead522b1aacaf4ec8aa55a6239 COPYING.mbsd |
29 | | */ |
30 | | |
31 | | static void |
32 | | copy_cooked(ch_t ** ppDest, char const ** ppSrc) |
33 | 0 | { |
34 | 0 | ch_t * pDest = (ch_t *)*ppDest; |
35 | 0 | const ch_t * pSrc = (const ch_t *)(*ppSrc + 1); |
36 | |
|
37 | 0 | for (;;) { |
38 | 0 | ch_t ch = *(pSrc++); |
39 | 0 | switch (ch) { |
40 | 0 | case NUL: *ppSrc = NULL; return; |
41 | 0 | case '"': goto done; |
42 | 0 | case '\\': |
43 | 0 | pSrc += ao_string_cook_escape_char((char *)pSrc, (char *)&ch, 0x7F); |
44 | 0 | if (ch == 0x7F) |
45 | 0 | break; |
46 | | /* FALLTHROUGH */ |
47 | | |
48 | 0 | default: |
49 | 0 | *(pDest++) = ch; |
50 | 0 | } |
51 | 0 | } |
52 | | |
53 | 0 | done: |
54 | 0 | *ppDest = (ch_t *)pDest; /* next spot for storing character */ |
55 | 0 | *ppSrc = (char const *)pSrc; /* char following closing quote */ |
56 | 0 | } |
57 | | |
58 | | |
59 | | static void |
60 | | copy_raw(ch_t ** ppDest, char const ** ppSrc) |
61 | 0 | { |
62 | 0 | ch_t * pDest = *ppDest; |
63 | 0 | cc_t * pSrc = (cc_t *) (*ppSrc + 1); |
64 | |
|
65 | 0 | for (;;) { |
66 | 0 | ch_t ch = *(pSrc++); |
67 | 0 | switch (ch) { |
68 | 0 | case NUL: *ppSrc = NULL; return; |
69 | 0 | case '\'': goto done; |
70 | 0 | case '\\': |
71 | | /* |
72 | | * *Four* escapes are handled: newline removal, escape char |
73 | | * quoting and apostrophe quoting |
74 | | */ |
75 | 0 | switch (*pSrc) { |
76 | 0 | case NUL: *ppSrc = NULL; return; |
77 | 0 | case '\r': |
78 | 0 | if (*(++pSrc) == NL) |
79 | 0 | ++pSrc; |
80 | 0 | continue; |
81 | | |
82 | 0 | case NL: |
83 | 0 | ++pSrc; |
84 | 0 | continue; |
85 | | |
86 | 0 | case '\'': |
87 | 0 | ch = '\''; |
88 | | /* FALLTHROUGH */ |
89 | |
|
90 | 0 | case '\\': |
91 | 0 | ++pSrc; |
92 | 0 | break; |
93 | 0 | } |
94 | | /* FALLTHROUGH */ |
95 | | |
96 | 0 | default: |
97 | 0 | *(pDest++) = ch; |
98 | 0 | } |
99 | 0 | } |
100 | | |
101 | 0 | done: |
102 | 0 | *ppDest = pDest; /* next spot for storing character */ |
103 | 0 | *ppSrc = (char const *) pSrc; /* char following closing quote */ |
104 | 0 | } |
105 | | |
106 | | static token_list_t * |
107 | | alloc_token_list(char const * str) |
108 | 0 | { |
109 | 0 | token_list_t * res; |
110 | |
|
111 | 0 | int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */ |
112 | |
|
113 | 0 | if (str == NULL) goto enoent_res; |
114 | | |
115 | | /* |
116 | | * Trim leading white space. Use "ENOENT" and a NULL return to indicate |
117 | | * an empty string was passed. |
118 | | */ |
119 | 0 | str = SPN_WHITESPACE_CHARS(str); |
120 | 0 | if (*str == NUL) goto enoent_res; |
121 | | |
122 | | /* |
123 | | * Take an approximate count of tokens. If no quoted strings are used, |
124 | | * it will be accurate. If quoted strings are used, it will be a little |
125 | | * high and we'll squander the space for a few extra pointers. |
126 | | */ |
127 | 0 | { |
128 | 0 | char const * pz = str; |
129 | |
|
130 | 0 | do { |
131 | 0 | max_token_ct++; |
132 | 0 | pz = BRK_WHITESPACE_CHARS(pz+1); |
133 | 0 | pz = SPN_WHITESPACE_CHARS(pz); |
134 | 0 | } while (*pz != NUL); |
135 | |
|
136 | 0 | res = malloc(sizeof(*res) + (size_t)(pz - str) |
137 | 0 | + ((size_t)max_token_ct * sizeof(ch_t *))); |
138 | 0 | } |
139 | |
|
140 | 0 | if (res == NULL) |
141 | 0 | errno = ENOMEM; |
142 | 0 | else res->tkn_list[0] = (ch_t *)(res->tkn_list + (max_token_ct - 1)); |
143 | |
|
144 | 0 | return res; |
145 | | |
146 | 0 | enoent_res: |
147 | |
|
148 | 0 | errno = ENOENT; |
149 | 0 | return NULL; |
150 | 0 | } |
151 | | |
152 | | /*=export_func ao_string_tokenize |
153 | | * |
154 | | * what: tokenize an input string |
155 | | * |
156 | | * arg: + char const * + string + string to be tokenized + |
157 | | * |
158 | | * ret_type: token_list_t * |
159 | | * ret_desc: pointer to a structure that lists each token |
160 | | * |
161 | | * doc: |
162 | | * |
163 | | * This function will convert one input string into a list of strings. |
164 | | * The list of strings is derived by separating the input based on |
165 | | * white space separation. However, if the input contains either single |
166 | | * or double quote characters, then the text after that character up to |
167 | | * a matching quote will become the string in the list. |
168 | | * |
169 | | * The returned pointer should be deallocated with @code{free(3C)} when |
170 | | * you are done using the data. The data are placed in a single block of |
171 | | * allocated memory. Do not deallocate individual token/strings. |
172 | | * |
173 | | * The structure pointed to will contain at least these two fields: |
174 | | * @table @samp |
175 | | * @item tkn_ct |
176 | | * The number of tokens found in the input string. |
177 | | * @item tkn_list |
178 | | * An array of @code{tkn_ct + 1} pointers to substring tokens, with |
179 | | * the last pointer set to NULL. |
180 | | * @end table |
181 | | * |
182 | | * There are two types of quoted strings: single quoted (@code{'}) and |
183 | | * double quoted (@code{"}). Singly quoted strings are fairly raw in that |
184 | | * escape characters (@code{\\}) are simply another character, except when |
185 | | * preceding the following characters: |
186 | | * @example |
187 | | * @code{\\} double backslashes reduce to one |
188 | | * @code{'} incorporates the single quote into the string |
189 | | * @code{\n} suppresses both the backslash and newline character |
190 | | * @end example |
191 | | * |
192 | | * Double quote strings are formed according to the rules of string |
193 | | * constants in ANSI-C programs. |
194 | | * |
195 | | * example: |
196 | | * @example |
197 | | * #include <stdlib.h> |
198 | | * int ix; |
199 | | * token_list_t * ptl = ao_string_tokenize(some_string); |
200 | | * for (ix = 0; ix < ptl->tkn_ct; ix++) |
201 | | * do_something_with_tkn(ptl->tkn_list[ix]); |
202 | | * free(ptl); |
203 | | * @end example |
204 | | * Note that everything is freed with the one call to @code{free(3C)}. |
205 | | * |
206 | | * err: |
207 | | * NULL is returned and @code{errno} will be set to indicate the problem: |
208 | | * @itemize @bullet |
209 | | * @item |
210 | | * @code{EINVAL} - There was an unterminated quoted string. |
211 | | * @item |
212 | | * @code{ENOENT} - The input string was empty. |
213 | | * @item |
214 | | * @code{ENOMEM} - There is not enough memory. |
215 | | * @end itemize |
216 | | =*/ |
217 | | token_list_t * |
218 | | ao_string_tokenize(char const * str) |
219 | 0 | { |
220 | 0 | token_list_t * res = alloc_token_list(str); |
221 | 0 | ch_t * pzDest; |
222 | | |
223 | | /* |
224 | | * Now copy each token into the output buffer. |
225 | | */ |
226 | 0 | if (res == NULL) |
227 | 0 | return res; |
228 | | |
229 | 0 | pzDest = (ch_t *)(res->tkn_list[0]); |
230 | 0 | res->tkn_ct = 0; |
231 | |
|
232 | 0 | do { |
233 | 0 | res->tkn_list[ res->tkn_ct++ ] = pzDest; |
234 | 0 | for (;;) { |
235 | 0 | int ch = (ch_t)*str; |
236 | 0 | if (IS_WHITESPACE_CHAR(ch)) { |
237 | 0 | found_white_space: |
238 | 0 | str = SPN_WHITESPACE_CHARS(str+1); |
239 | 0 | break; |
240 | 0 | } |
241 | | |
242 | 0 | switch (ch) { |
243 | 0 | case '"': |
244 | 0 | copy_cooked(&pzDest, &str); |
245 | 0 | if (str == NULL) { |
246 | 0 | free(res); |
247 | 0 | errno = EINVAL; |
248 | 0 | return NULL; |
249 | 0 | } |
250 | 0 | if (IS_WHITESPACE_CHAR(*str)) |
251 | 0 | goto found_white_space; |
252 | 0 | break; |
253 | | |
254 | 0 | case '\'': |
255 | 0 | copy_raw(&pzDest, &str); |
256 | 0 | if (str == NULL) { |
257 | 0 | free(res); |
258 | 0 | errno = EINVAL; |
259 | 0 | return NULL; |
260 | 0 | } |
261 | 0 | if (IS_WHITESPACE_CHAR(*str)) |
262 | 0 | goto found_white_space; |
263 | 0 | break; |
264 | | |
265 | 0 | case NUL: |
266 | 0 | goto copy_done; |
267 | | |
268 | 0 | default: |
269 | 0 | str++; |
270 | 0 | *(pzDest++) = (unsigned char)ch; |
271 | 0 | } |
272 | 0 | } copy_done:; |
273 | | |
274 | | /* |
275 | | * NUL terminate the last token and see if we have any more tokens. |
276 | | */ |
277 | 0 | *(pzDest++) = NUL; |
278 | 0 | } while (*str != NUL); |
279 | | |
280 | 0 | res->tkn_list[ res->tkn_ct ] = NULL; |
281 | |
|
282 | 0 | return res; |
283 | 0 | } |
284 | | |
285 | | #ifdef TEST |
286 | | #include <stdio.h> |
287 | | #include <string.h> |
288 | | |
289 | | int |
290 | | main(int argc, char ** argv) |
291 | | { |
292 | | if (argc == 1) { |
293 | | printf("USAGE: %s arg [ ... ]\n", *argv); |
294 | | return 1; |
295 | | } |
296 | | while (--argc > 0) { |
297 | | char * arg = *(++argv); |
298 | | token_list_t * p = ao_string_tokenize(arg); |
299 | | if (p == NULL) { |
300 | | printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n", |
301 | | arg, errno, strerror(errno)); |
302 | | } else { |
303 | | int ix = 0; |
304 | | printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct); |
305 | | do { |
306 | | printf(" %3d: ``%s''\n", ix+1, p->tkn_list[ix]); |
307 | | } while (++ix < p->tkn_ct); |
308 | | free(p); |
309 | | } |
310 | | } |
311 | | return 0; |
312 | | } |
313 | | #endif |
314 | | |
315 | | /** @} |
316 | | * |
317 | | * Local Variables: |
318 | | * mode: C |
319 | | * c-file-style: "stroustrup" |
320 | | * indent-tabs-mode: nil |
321 | | * End: |
322 | | * end of autoopts/tokenize.c */ |