/src/wget2/libwget/css.c

Source (jump to first uncovered line)
/*
 * Copyright (c) 2012 Tim Ruehsen
 * Copyright (c) 2015-2023 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * css parsing routines
 *
 * Changelog
 * 03.07.2012  Tim Ruehsen  created
 *
 * A parser using the flex tokenizer, created with flex tokens from
 *   https://www.w3.org/TR/css3-syntax/
 *
 * TODO:
 *  - since we are just interested in @import ... and url(...), we could use
 *    a simplistic hand-written parser which might be much smaller and faster
 */

#include <config.h>

#include <stddef.h>
#include <limits.h>
#include <unistd.h>
#include <string.h>
#include <c-ctype.h>
#include <fcntl.h>
#include <sys/stat.h>
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif

#include <wget.h>
#include "private.h"

#include "css_tokenizer.h"

// Forward declarations for the flex tokenizer used by this file.
// see css_tokenizer.c

// Opaque scanner handle handed to every yy*() call below.
typedef void* yyscan_t;
// Length in bytes of the token most recently returned by yylex().
int yyget_leng(yyscan_t yyscanner);
// Text of the token most recently returned by yylex().
char *yyget_text(yyscan_t yyscanner);
typedef struct yy_buffer_state *YY_BUFFER_STATE;
// Creates a scanner instance; it is released again with yylex_destroy().
int yylex_init(yyscan_t* scanner);
// NOTE(review): yy_scan_string() is declared but not called in the code visible here.
YY_BUFFER_STATE yy_scan_string(const char * yystr, yyscan_t yyscanner);
// Sets up scanning of 'len' bytes starting at 'yystr' (the length is an int, not a size_t).
YY_BUFFER_STATE yy_scan_bytes(const char * yystr, int len, yyscan_t yyscanner);
// Returns the next token id (see css_tokenizer.h); CSSEOF marks the end of input.
int yylex(yyscan_t yyscanner);
int yylex_destroy(yyscan_t yyscanner);
// Allocation hooks, defined right below in this file.
void *yyalloc(size_t size);
void *yyrealloc(void *p, size_t size);

// Allocation hook (presumably invoked by the generated tokenizer, see css_tokenizer.c):
// hands the request for 'size' bytes straight to wget_malloc().
void *yyalloc(size_t size)
{
  void *mem = wget_malloc(size);

  return mem;
}
// Reallocation hook (presumably invoked by the generated tokenizer, see css_tokenizer.c):
// forwards the block 'p' and the requested 'size' to wget_realloc().
void *yyrealloc(void *p, size_t size)
{
  void *resized = wget_realloc(p, size);

  return resized;
}

/*
 * Tokenize a CSS buffer and report the URIs and the charset found in it.
 *
 * buf               - CSS data; exactly 'len' bytes are scanned, a terminating
 *                     NUL byte is not required
 * len               - number of bytes in 'buf' (must fit into an int, since
 *                     yy_scan_bytes() takes an int length)
 * callback_uri      - if not NULL, called as (user_ctx, uri, uri_length, pos)
 *                     for every url(...) token and for a string that follows
 *                     @import; 'uri' points into the scanner's token text and
 *                     is only 'uri_length' bytes long, 'pos' is the offset of
 *                     the URI computed by summing up the token lengths
 * callback_encoding - if not NULL, called as (user_ctx, name, name_length)
 *                     for the string that follows @charset
 * user_ctx          - passed unchanged as first argument to both callbacks
 *
 * Nothing is returned. An empty or NULL buffer is a no-op; a buffer larger
 * than INT_MAX bytes is rejected with an error message.
 */
void wget_css_parse_buffer(
  const char *buf,
  size_t len,
  wget_css_parse_uri_callback *callback_uri,
  wget_css_parse_encoding_callback *callback_encoding,
  void *user_ctx)
{
  int token;
  size_t length, pos = 0;
  char *text;
  yyscan_t scanner;

  if (!buf || !len)
    return; // nothing to scan

  // yy_scan_bytes() takes an int, a larger value would be truncated by the cast
  if (len > INT_MAX) {
    error_printf(_("CSS buffer too large (%zu bytes)\n"), len);
    return;
  }

  if (yylex_init(&scanner)) {
    error_printf(_("Failed to init CSS scanner\n"));
    return;
  }

  yy_scan_bytes(buf, (int) len, scanner);

  while ((token = yylex(scanner)) != CSSEOF) {
    if (token == IMPORT_SYM) {
      // e.g. @import "https://example.com/index.html"
      pos += yyget_leng(scanner);

      // skip whitespace before URI/STRING
      while ((token = yylex(scanner)) == S)
        pos += yyget_leng(scanner);

      // input ended right after @import: stop here, yylex() must not be
      // called again once it has reported the end of input
      if (token == CSSEOF)
        break;

      // now token should be STRING or URI
      if (token == STRING)
        token = URI;
    }

    if (token == URI && callback_uri) {
      // e.g. url(https://example.com/index.html)
      text = yyget_text(scanner);
      length = yyget_leng(scanner);

      if (*text == '\'' || *text == '\"') {
        // a string - remove the quotes
        // (defensive: needs both quotes, else 'length - 2' would wrap around)
        if (length >= 2)
          callback_uri(user_ctx, text + 1, length - 2, pos + 1);
      } else {
        // extract URI from url(...)
        // (defensive: the shortest token handled here is "url()", anything
        // shorter would make 'length -= 4' below wrap around)
        if (length >= 5 && !wget_strncasecmp_ascii(text, "url(", 4)) {
          char *otext = text;

          // remove trailing ) and any spaces before
          for (length--; c_isspace(text[length - 1]); length--);

          // remove leading url( and any spaces after
          for (length -= 4, text += 4; length && c_isspace(*text); text++, length--);

          // remove quotes
          if (length && (*text == '\'' || *text == '\"')) {
            text++;
            length--;
          }

          if (length && (text[length - 1] == '\'' || text[length - 1] == '\"'))
            length--;

          callback_uri(user_ctx, text, length, pos + (text - otext));
        }
      }
    } else if (token == CHARSET_SYM && callback_encoding) {
      // e.g. @charset "UTF-8"
      pos += yyget_leng(scanner);

      // skip whitespace before charset name
      while ((token = yylex(scanner)) == S)
        pos += yyget_leng(scanner);

      // now token should be STRING
      if (token == STRING) {
        text = yyget_text(scanner);
        length = yyget_leng(scanner);

        if (*text == '\'' || *text == '\"') {
          // a string - remove the quotes
          // (defensive: needs both quotes, else 'length - 2' would wrap around)
          if (length >= 2)
            callback_encoding(user_ctx, text + 1, length - 2);
        } else {
          // a string without quotes
          callback_encoding(user_ctx, text, length);
        }
      } else {
        error_printf(_("Unknown token after @charset: %d\n"), token);

        // input ended right after @charset: do not call yylex() again
        if (token == CSSEOF)
          break;
      }
    }
    pos += yyget_leng(scanner);
  }

  yylex_destroy(scanner);
}

/*
 * Read a CSS file (or STDIN if 'fname' is "-") and pass its contents to
 * wget_css_parse_buffer().
 *
 * fname             - path of the file to parse, "-" reads from STDIN
 * callback_uri      - forwarded to wget_css_parse_buffer()
 * callback_encoding - forwarded to wget_css_parse_buffer()
 * user_ctx          - forwarded to wget_css_parse_buffer()
 *
 * Nothing is returned. Failures to open or read the file are reported via
 * error_printf(); an empty file results in no callback invocation.
 */
void wget_css_parse_file(
  const char *fname,
  wget_css_parse_uri_callback *callback_uri,
  wget_css_parse_encoding_callback *callback_encoding,
  void *user_ctx)
{
  if (!fname)
    return;

  if (strcmp(fname,"-")) {
    int fd;

    if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
      struct stat st;

      if (fstat(fd, &st) == 0 && st.st_size > 0) {
        size_t size = (size_t) st.st_size;
#ifdef HAVE_MMAP
        // Map exactly the file size, read-only. The parser takes an explicit
        // length, so no terminating NUL byte is needed. Writing one behind
        // the data would touch a page without file backing whenever the file
        // size is a multiple of the page size.
        char *buf = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);

        if (buf != MAP_FAILED) {
          wget_css_parse_buffer(buf, size, callback_uri, callback_encoding, user_ctx);
          munmap(buf, size); // same length as used for mmap()
        } else
          error_printf(_("Failed to read %s\n"), fname);
#else
        char *buf = wget_malloc(size);

        if (buf) {
          size_t nread = 0;
          ssize_t rc = 0;

          // read() may return less than requested, so loop until done
          while (nread < size && (rc = read(fd, buf + nread, size - nread)) > 0)
            nread += (size_t) rc;

          if (rc < 0)
            error_printf(_("Failed to read %s\n"), fname);
          else if (nread)
            wget_css_parse_buffer(buf, nread, callback_uri, callback_encoding, user_ctx); // only parse what was really read

          xfree(buf);
        }
#endif
      }
      close(fd);
    } else
      error_printf(_("Failed to open %s\n"), fname);
  } else {
    // read data from STDIN.
    // The size is not known in advance, so collect everything in memory first.
    char tmp[4096];
    ssize_t nbytes;
    wget_buffer buf;

    wget_buffer_init(&buf, NULL, 4096);

    while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
      wget_buffer_memcat(&buf, tmp, nbytes);
    }

    if (buf.length)
      wget_css_parse_buffer(buf.data, buf.length, callback_uri, callback_encoding, user_ctx);

    wget_buffer_deinit(&buf);
  }
}

Coverage Report

Created: 2023-11-19 06:49

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2012 Tim Ruehsen
3		* Copyright (c) 2015-2023 Free Software Foundation, Inc.
4		*
5		* This file is part of libwget.
6		*
7		* Libwget is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as published by
9		* the Free Software Foundation, either version 3 of the License, or
10		* (at your option) any later version.
11		*
12		* Libwget is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libwget. If not, see <https://www.gnu.org/licenses/>.
19		*
20		*
21		* css parsing routines
22		*
23		* Changelog
24		* 03.07.2012 Tim Ruehsen created
25		*
26		* A parser using the flex tokenizer, created with flex tokens from
27		* https://www.w3.org/TR/css3-syntax/
28		*
29		* TODO:
30		* - since we are just interested in @import ... and url(...), we could use
31		* a simplistic hand-written parser which might be much smaller and faster
32		*/
33
34		#include <config.h>
35
36		#include <stddef.h>
37		#include <unistd.h>
38		#include <string.h>
39		#include <c-ctype.h>
40		#include <fcntl.h>
41		#include <sys/stat.h>
42		#ifdef HAVE_MMAP
43		#include <sys/mman.h>
44		#endif
45
46		#include <wget.h>
47		#include "private.h"
48
49		#include "css_tokenizer.h"
50
51		// see css_tokenizer.c
52		typedef void* yyscan_t;
53		int yyget_leng(yyscan_t yyscanner);
54		char *yyget_text(yyscan_t yyscanner);
55		typedef struct yy_buffer_state *YY_BUFFER_STATE;
56		int yylex_init(yyscan_t* scanner);
57		YY_BUFFER_STATE yy_scan_string(const char * yystr, yyscan_t yyscanner);
58		YY_BUFFER_STATE yy_scan_bytes(const char * yystr, int len, yyscan_t yyscanner);
59		int yylex(yyscan_t yyscanner);
60		int yylex_destroy(yyscan_t yyscanner);
61		void *yyalloc(size_t size);
62		void *yyrealloc(void *p, size_t size);
63
64	13.2k	void *yyalloc(size_t size) {
65	13.2k	return wget_malloc(size);
66	13.2k	}
67	0	void *yyrealloc(void *p, size_t size) {
68	0	return wget_realloc(p, size);
69	0	}
70
71		void wget_css_parse_buffer(
72		const char *buf,
73		size_t len,
74		wget_css_parse_uri_callback *callback_uri,
75		wget_css_parse_encoding_callback *callback_encoding,
76		void *user_ctx)
77	3.30k	{
78	3.30k	int token;
79	3.30k	size_t length, pos = 0;
80	3.30k	char *text;
81	3.30k	yyscan_t scanner;
82
83	3.30k	yylex_init(&scanner);
84	3.30k	yy_scan_bytes(buf, (int) len, scanner);
85
86	75.9k	while ((token = yylex(scanner)) != CSSEOF) {
87	72.6k	if (token == IMPORT_SYM) {
88		// e.g. @import "https://example.com/index.html"
89	2.85k	pos += yyget_leng(scanner);
90
91		// skip whitespace before URI/STRING
92	3.66k	while ((token = yylex(scanner)) == S)
93	813	pos += yyget_leng(scanner);
94
95		// now token should be STRING or URI
96	2.85k	if (token == STRING)
97	1.17k	token = URI;
98	2.85k	}
99
100	72.6k	if (token == URI && callback_uri) {
101		// e.g. url(https://example.com/index.html)
102	16.6k	text = yyget_text(scanner);
103	16.6k	length = yyget_leng(scanner);
104
105	16.6k	if (*text == '\'' || *text == '\"') {
106		// a string - remove the quotes
107	791	callback_uri(user_ctx, text + 1, length - 2, pos + 1);
108	15.8k	} else {
109		// extract URI from url(...)
110	15.8k	if (!wget_strncasecmp_ascii(text, "url(", 4)) {
111	14.6k	char *otext = text;
112
113		// remove trailing ) and any spaces before
114	15.9k	for (length--; c_isspace(text[length - 1]); length--);
115
116		// remove leading url( and any spaces after
117	15.4k	for (length -= 4, text += 4; length && c_isspace(*text); text++, length--);
118
119		// remove quotes
120	14.6k	if (length && (*text == '\'' || *text == '\"')) {
121	834	text++;
122	834	length--;
123	834	}
124
125	14.6k	if (length && (text[length - 1] == '\'' || text[length - 1] == '\"'))
126	838	length--;
127
128	14.6k	callback_uri(user_ctx, text, length, pos + (text - otext));
129	14.6k	}
130	15.8k	}
131	55.9k	} else if (token == CHARSET_SYM && callback_encoding) {
132		// e.g. @charset "UTF-8"
133	1.31k	pos += yyget_leng(scanner);
134
135		// skip whitespace before charset name
136	1.71k	while ((token = yylex(scanner)) == S)
137	401	pos += yyget_leng(scanner);
138
139		// now token should be STRING
140	1.31k	if (token == STRING) {
141	789	text = yyget_text(scanner);
142	789	length = yyget_leng(scanner);
143
144	789	if (*text == '\'' || *text == '\"') {
145		// a string - remove the quotes
146	391	callback_encoding(user_ctx, text + 1, length - 2);
147	398	} else {
148		// a string without quotes
149	398	callback_encoding(user_ctx, text, length);
150	398	}
151	789	} else {
152	521	error_printf(_("Unknown token after @charset: %d\n"), token);
153	521	}
154	1.31k	}
155	72.6k	pos += yyget_leng(scanner);
156	72.6k	}
157
158	3.30k	yylex_destroy(scanner);
159	3.30k	}
160
161		void wget_css_parse_file(
162		const char *fname,
163		wget_css_parse_uri_callback *callback_uri,
164		wget_css_parse_encoding_callback *callback_encoding,
165		void *user_ctx)
166	0	{
167	0	if (strcmp(fname,"-")) {
168	0	int fd;
169
170	0	if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
171	0	struct stat st;
172	0	if (fstat(fd, &st) == 0) {
173	0	#ifdef HAVE_MMAP
174	0	size_t nread = st.st_size;
175	0	char *buf = mmap(NULL, nread + 1, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
176		#else
177		char *buf=wget_malloc(st.st_size+1);
178		size_t nread=read(fd,buf,st.st_size);
179		#endif
180
181	0	if (nread > 0) {
182	0	buf[nread] = 0; // PROT_WRITE allows this write, MAP_PRIVATE prevents changes in underlying file system
183	0	wget_css_parse_buffer(buf, st.st_size, callback_uri, callback_encoding, user_ctx);
184	0	}
185
186	0	#ifdef HAVE_MMAP
187	0	munmap(buf, nread);
188		#else
189		xfree(buf);
190		#endif
191	0	}
192	0	close(fd);
193	0	} else
194	0	error_printf(_("Failed to open %s\n"), fname);
195	0	} else {
196		// read data from STDIN.
197		// maybe should use yy_scan_bytes instead of buffering into memory.
198	0	char tmp[4096];
199	0	ssize_t nbytes;
200	0	wget_buffer buf;
201
202	0	wget_buffer_init(&buf, NULL, 4096);
203
204	0	while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
205	0	wget_buffer_memcat(&buf, tmp, nbytes);
206	0	}
207
208	0	if (buf.length)
209	0	wget_css_parse_buffer(buf.data, buf.length, callback_uri, callback_encoding, user_ctx);
210
211	0	wget_buffer_deinit(&buf);
212	0	}
213	0	}