/src/wireshark/wsutil/regex.c

Source
/*
 * Wireshark - Network traffic analyzer
 * By Gerald Combs <gerald@wireshark.org>
 * Copyright 1998 Gerald Combs
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "config.h"

#include "regex.h"

#include <wsutil/str_util.h>
#include <pcre2.h>


/* A compiled regular expression paired with a printable copy of its source. */
struct _ws_regex {
    /* Compiled PCRE2 program; released with pcre2_code_free() in ws_regex_free(). */
    pcre2_code *code;
    /* Escaped copy of the pattern text, produced by ws_escape_string_len()
     * at compile time and released with g_free() in ws_regex_free(). */
    char *pattern;
};

#define ERROR_MAXLEN_IN_CODE_UNITS   128

/*
 * Translate a PCRE2 error code into human-readable text.
 *
 * errorcode: a value reported by pcre2_compile() or pcre2_match().
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * release with g_free(). The string is empty if PCRE2 writes no text for
 * the given code.
 */
static char *
get_error_msg(int errorcode)
{
    uint8_t *buffer;

    /*
     * We have to provide a buffer and we don't know how long the
     * error message is or even the maximum size. From pcre2api(3):
     *     "None of the messages are very long; a
     *     buffer size of 120 code units is ample."
     */
    /* Code unit = one byte */
    buffer = g_malloc(ERROR_MAXLEN_IN_CODE_UNITS);
    /*
     * g_malloc() does not clear the allocation. pcre2api(3) says an unknown
     * error number yields PCRE2_ERROR_BADDATA but does not promise that the
     * buffer is written in that case, so start from an empty string rather
     * than handing uninitialised bytes back to the caller.
     */
    buffer[0] = '\0';
    /* Message is returned with a trailing zero. */
    pcre2_get_error_message(errorcode, buffer, ERROR_MAXLEN_IN_CODE_UNITS);
    /* One more at the end for good luck. */
    buffer[ERROR_MAXLEN_IN_CODE_UNITS-1] = '\0';
    return (char*)buffer;
}


/*
 * Compile a pattern with PCRE2.
 *
 * patt:   pattern text.
 * size:   length of patt in bytes, or a negative value if patt is
 *         NUL-terminated.
 * errmsg: optional out-parameter. On failure, if non-NULL, it receives a
 *         newly allocated error description that the caller must g_free().
 *         It is left untouched on success.
 * flags:  bitwise OR of WS_REGEX_* values, translated below to the
 *         corresponding PCRE2 compile options.
 *
 * Returns the compiled code, or NULL if compilation failed.
 */
static pcre2_code *
compile_pcre2(const char *patt, ssize_t size, char **errmsg, unsigned flags)
{
    pcre2_code *code;
    int errorcode;
    PCRE2_SIZE length;
    PCRE2_SIZE erroroffset;
    uint32_t options = 0;

    if (size < 0)
        length = PCRE2_ZERO_TERMINATED;
    else
        length = (PCRE2_SIZE)size;

    if (flags & WS_REGEX_NEVER_UTF)
        options |= PCRE2_NEVER_UTF;
    if (flags & WS_REGEX_CASELESS)
        options |= PCRE2_CASELESS;
    if (flags & WS_REGEX_ANCHORED)
        options |= PCRE2_ANCHORED;

    /* By default UTF-8 is off. */
    code = pcre2_compile_8((PCRE2_SPTR)patt,
                length,
                options,
                &errorcode,
                &erroroffset,
                NULL);

    if (code == NULL) {
        /* Callers that do not want the message may pass NULL. */
        if (errmsg != NULL)
            *errmsg = get_error_msg(errorcode);
        return NULL;
    }

    return code;
}


/*
 * Build a ws_regex_t from a pattern of explicit length and a set of
 * WS_REGEX_* flags.
 *
 * Returns NULL if patt is NULL or if the pattern does not compile; in the
 * latter case the error text is delivered through errmsg by compile_pcre2().
 * The returned object keeps an escaped copy of the pattern text.
 */
ws_regex_t *
ws_regex_compile_ex(const char *patt, ssize_t size, char **errmsg, unsigned flags)
{
    ws_regex_t *regex;
    pcre2_code *compiled;

    ws_return_val_if(!patt, NULL);

    compiled = compile_pcre2(patt, size, errmsg, flags);
    if (!compiled) {
        return NULL;
    }

    regex = g_new(ws_regex_t, 1);
    regex->pattern = ws_escape_string_len(NULL, patt, size, false);
    regex->code = compiled;
    return regex;
}


/*
 * Convenience wrapper around ws_regex_compile_ex() for a NUL-terminated
 * pattern compiled with no flags set.
 */
ws_regex_t *
ws_regex_compile(const char *patt, char **errmsg)
{
    /* A negative size tells the compiler the pattern is NUL-terminated. */
    const ssize_t nul_terminated = -1;
    const unsigned no_flags = 0;

    return ws_regex_compile_ex(patt, nul_terminated, errmsg, no_flags);
}


/*
 * Run compiled code against a subject buffer.
 *
 * subj_length: subject length in bytes, or negative if NUL-terminated.
 * subj_offset: offset in the subject at which matching starts.
 * match_data:  caller-provided PCRE2 match data block that receives the
 *              match offsets.
 *
 * Returns true when pcre2_match() reports a non-negative result, false
 * for "no match" and for any other error (which is logged at debug level).
 */
static bool
match_pcre2(pcre2_code *code, const char *subject, ssize_t subj_length,
                size_t subj_offset, pcre2_match_data *match_data)
{
    const PCRE2_SIZE subject_len = (subj_length < 0)
                                    ? PCRE2_ZERO_TERMINATED
                                    : (PCRE2_SIZE)subj_length;

    const int status = pcre2_match(code,
                                   (const uint8_t*)subject,
                                   subject_len,
                                   (PCRE2_SIZE)subj_offset,
                                   0,          /* default options */
                                   match_data,
                                   NULL);

    if (status >= 0) {
        /* Matched */
        return true;
    }

    if (status != PCRE2_ERROR_NOMATCH) {
        /* A genuine error rather than a plain mismatch. Not expected while
         * UTF-8 is disabled, though a very large subject might run into an
         * internal limit. */
        char *description = get_error_msg(status);
        ws_debug("Unexpected pcre2_match() error: %s.", description);
        g_free(description);
    }

    return false;
}


/*
 * Test a NUL-terminated subject string against a compiled regex.
 * Delegates to ws_regex_matches_length() with a negative length.
 */
bool
ws_regex_matches(const ws_regex_t *re, const char *subj)
{
    /* Negative length: let the matcher find the terminating NUL itself. */
    const ssize_t nul_terminated = -1;

    return ws_regex_matches_length(re, subj, nul_terminated);
}


/*
 * Test a subject buffer against a compiled regex, starting at offset 0.
 *
 * subj_length: subject length in bytes, or negative if NUL-terminated.
 *
 * Returns false if re or subj is NULL, otherwise whether the pattern matched.
 */
bool
ws_regex_matches_length(const ws_regex_t *re,
                        const char *subj, ssize_t subj_length)
{
    ws_return_val_if(!re, false);
    ws_return_val_if(!subj, false);

    /* The match position is not needed here, but pcre2_match still wants
     * room for at least one offset pair. */
    pcre2_match_data *scratch = pcre2_match_data_create(1, NULL);
    const bool found = match_pcre2(re->code, subj, subj_length, 0, scratch);
    pcre2_match_data_free(scratch);

    return found;
}


/*
 * Test a subject buffer against a compiled regex and optionally report
 * where the match lies.
 *
 * subj_length: subject length in bytes, or negative if NUL-terminated.
 * subj_offset: offset in the subject at which matching starts.
 * pos_vect:    if non-NULL and the pattern matched, receives the first
 *              offset pair of the PCRE2 output vector. Left untouched
 *              otherwise.
 *
 * Returns false if re or subj is NULL, otherwise whether the pattern matched.
 */
bool
ws_regex_matches_pos(const ws_regex_t *re,
                        const char *subj, ssize_t subj_length,
                        size_t subj_offset, size_t pos_vect[2])
{
    ws_return_val_if(!re, false);
    ws_return_val_if(!subj, false);

    pcre2_match_data *scratch = pcre2_match_data_create(1, NULL);
    const bool found = match_pcre2(re->code, subj, subj_length,
                                   subj_offset, scratch);

    if (found && pos_vect != NULL) {
        /* Copy the offsets out before the match data is released. */
        const PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(scratch);
        pos_vect[0] = offsets[0];
        pos_vect[1] = offsets[1];
    }

    pcre2_match_data_free(scratch);
    return found;
}


/*
 * Release a regex object: its compiled PCRE2 code, its stored pattern
 * string, and the object itself.
 *
 * Passing NULL is a no-op, in line with g_free() and pcre2_code_free().
 */
void
ws_regex_free(ws_regex_t *re)
{
    if (re == NULL)
        return;

    pcre2_code_free(re->code);
    g_free(re->pattern);
    g_free(re);
}


/*
 * Return the pattern string stored in a regex object (the escaped copy
 * made when it was compiled). The string remains owned by the object.
 *
 * Returns NULL if re is NULL, so that a NULL handle is rejected here as it
 * is by the matching functions in this file.
 */
const char *
ws_regex_pattern(const ws_regex_t *re)
{
    if (re == NULL)
        return NULL;

    return re->pattern;
}

Coverage Report

Created: 2026-03-30 07:00

Line	Count	Source
1		/*
2		* Wireshark - Network traffic analyzer
3		* By Gerald Combs <gerald@wireshark.org>
4		* Copyright 1998 Gerald Combs
5		*
6		* SPDX-License-Identifier: GPL-2.0-or-later
7		*/
8
9		#include "config.h"
10
11		#include "regex.h"
12
13		#include <wsutil/str_util.h>
14		#include <pcre2.h>
15
16
17		struct _ws_regex {
18		pcre2_code *code;
19		char *pattern;
20		};
21
22	0	#define ERROR_MAXLEN_IN_CODE_UNITS 128
23
24		static char *
25		get_error_msg(int errorcode)
26	0	{
27	0	uint8_t *buffer;
28
29		/*
30		* We have to provide a buffer and we don't know how long the
31		* error message is or even the maximum size. From pcre2api(3):
32		* "None of the messages are very long; a
33		* buffer size of 120 code units is ample."
34		*/
35		/* Code unit = one byte */
36	0	buffer = g_malloc(ERROR_MAXLEN_IN_CODE_UNITS);
37		/* Message is returned with a trailing zero. */
38	0	pcre2_get_error_message(errorcode, buffer, ERROR_MAXLEN_IN_CODE_UNITS);
39		/* One more at the end for good luck. */
40	0	buffer[ERROR_MAXLEN_IN_CODE_UNITS-1] = '\0';
41	0	return (char*)buffer;
42	0	}
43
44
45		static pcre2_code *
46		compile_pcre2(const char *patt, ssize_t size, char **errmsg, unsigned flags)
47	0	{
48	0	pcre2_code *code;
49	0	int errorcode;
50	0	PCRE2_SIZE length;
51	0	PCRE2_SIZE erroroffset;
52	0	uint32_t options = 0;
53
54	0	if (size < 0)
55	0	length = PCRE2_ZERO_TERMINATED;
56	0	else
57	0	length = (PCRE2_SIZE)size;
58
59	0	if (flags & WS_REGEX_NEVER_UTF)
60	0	options |= PCRE2_NEVER_UTF;
61	0	if (flags & WS_REGEX_CASELESS)
62	0	options |= PCRE2_CASELESS;
63	0	if (flags & WS_REGEX_ANCHORED)
64	0	options |= PCRE2_ANCHORED;
65
66		/* By default UTF-8 is off. */
67	0	code = pcre2_compile_8((PCRE2_SPTR)patt,
68	0	length,
69	0	options,
70	0	&errorcode,
71	0	&erroroffset,
72	0	NULL);
73
74	0	if (code == NULL) {
75	0	*errmsg = get_error_msg(errorcode);
76	0	return NULL;
77	0	}
78
79	0	return code;
80	0	}
81
82
83		ws_regex_t *
84		ws_regex_compile_ex(const char *patt, ssize_t size, char **errmsg, unsigned flags)
85	0	{
86	0	ws_return_val_if(!patt, NULL);
87
88	0	pcre2_code *code = compile_pcre2(patt, size, errmsg, flags);
89	0	if (code == NULL)
90	0	return NULL;
91
92	0	ws_regex_t *re = g_new(ws_regex_t, 1);
93	0	re->code = code;
94	0	re->pattern = ws_escape_string_len(NULL, patt, size, false);
95	0	return re;
96	0	}
97
98
99		ws_regex_t *
100		ws_regex_compile(const char *patt, char **errmsg)
101	0	{
102	0	return ws_regex_compile_ex(patt, -1, errmsg, 0);
103	0	}
104
105
106		static bool
107		match_pcre2(pcre2_code *code, const char *subject, ssize_t subj_length,
108		size_t subj_offset, pcre2_match_data *match_data)
109	0	{
110	0	PCRE2_SIZE length;
111	0	int rc;
112
113	0	if (subj_length < 0)
114	0	length = PCRE2_ZERO_TERMINATED;
115	0	else
116	0	length = (PCRE2_SIZE)subj_length;
117
118	0	rc = pcre2_match(code,
119	0	(const uint8_t*)subject,
120	0	length,
121	0	(PCRE2_SIZE)subj_offset,
122	0	0, /* default options */
123	0	match_data,
124	0	NULL);
125
126	0	if (rc < 0) {
127		/* No match */
128	0	if (rc != PCRE2_ERROR_NOMATCH) {
129		/* Error. Should not happen with UTF-8 disabled. Some huge
130		* subject strings could hit some internal limit. */
131	0	char *msg = get_error_msg(rc);
132	0	ws_debug("Unexpected pcre2_match() error: %s.", msg);
133	0	g_free(msg);
134	0	}
135	0	return false;
136	0	}
137
138		/* Matched */
139	0	return true;
140	0	}
141
142
143		bool
144		ws_regex_matches(const ws_regex_t *re, const char *subj)
145	0	{
146	0	return ws_regex_matches_length(re, subj, -1);
147	0	}
148
149
150		bool
151		ws_regex_matches_length(const ws_regex_t *re,
152		const char *subj, ssize_t subj_length)
153	0	{
154	0	bool matched;
155	0	pcre2_match_data *match_data;
156
157	0	ws_return_val_if(!re, false);
158	0	ws_return_val_if(!subj, false);
159
160		/* We don't use the matched substring but pcre2_match requires
161		* at least one pair of offsets. */
162	0	match_data = pcre2_match_data_create(1, NULL);
163	0	matched = match_pcre2(re->code, subj, subj_length, 0, match_data);
164	0	pcre2_match_data_free(match_data);
165	0	return matched;
166	0	}
167
168
169		bool
170		ws_regex_matches_pos(const ws_regex_t *re,
171		const char *subj, ssize_t subj_length,
172		size_t subj_offset, size_t pos_vect[2])
173	0	{
174	0	bool matched;
175	0	pcre2_match_data *match_data;
176
177	0	ws_return_val_if(!re, false);
178	0	ws_return_val_if(!subj, false);
179
180	0	match_data = pcre2_match_data_create(1, NULL);
181	0	matched = match_pcre2(re->code, subj, subj_length, subj_offset, match_data);
182	0	if (matched && pos_vect) {
183	0	PCRE2_SIZE *ovect = pcre2_get_ovector_pointer(match_data);
184	0	pos_vect[0] = ovect[0];
185	0	pos_vect[1] = ovect[1];
186	0	}
187	0	pcre2_match_data_free(match_data);
188	0	return matched;
189	0	}
190
191
192		void
193		ws_regex_free(ws_regex_t *re)
194	0	{
195	0	pcre2_code_free(re->code);
196	0	g_free(re->pattern);
197	0	g_free(re);
198	0	}
199
200
201		const char *
202		ws_regex_pattern(const ws_regex_t *re)
203	0	{
204	0	return re->pattern;
205	0	}