Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2012 Tim Ruehsen |
3 | | * Copyright (c) 2015-2023 Free Software Foundation, Inc. |
4 | | * |
5 | | * This file is part of libwget. |
6 | | * |
7 | | * Libwget is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as published by |
9 | | * the Free Software Foundation, either version 3 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * Libwget is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libwget. If not, see <https://www.gnu.org/licenses/>. |
19 | | * |
20 | | * |
21 | | * css parsing routines |
22 | | * |
23 | | * Changelog |
24 | | * 03.07.2012 Tim Ruehsen created |
25 | | * |
26 | | * A parser using the flex tokenizer, created with flex tokens from |
27 | | * https://www.w3.org/TR/css3-syntax/ |
28 | | * |
29 | | * TODO: |
30 | | * - since we are just interested in @import ... and url(...), we could use |
31 | | * a simplistic hand-written parser which might be much smaller and faster |
32 | | */ |
33 | | |
#include <config.h>

#include <stddef.h>
#include <limits.h>
#include <unistd.h>
#include <string.h>
#include <c-ctype.h>
#include <fcntl.h>
#include <sys/stat.h>
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif
45 | | |
46 | | #include <wget.h> |
47 | | #include "private.h" |
48 | | |
49 | | #include "css_tokenizer.h" |
50 | | |
51 | | // see css_tokenizer.c |
52 | | typedef void* yyscan_t; |
53 | | int yyget_leng(yyscan_t yyscanner); |
54 | | char *yyget_text(yyscan_t yyscanner); |
55 | | typedef struct yy_buffer_state *YY_BUFFER_STATE; |
56 | | int yylex_init(yyscan_t* scanner); |
57 | | YY_BUFFER_STATE yy_scan_string(const char * yystr, yyscan_t yyscanner); |
58 | | YY_BUFFER_STATE yy_scan_bytes(const char * yystr, int len, yyscan_t yyscanner); |
59 | | int yylex(yyscan_t yyscanner); |
60 | | int yylex_destroy(yyscan_t yyscanner); |
61 | | void *yyalloc(size_t size); |
62 | | void *yyrealloc(void *p, size_t size); |
63 | | |
// Allocation hook for the scanner declared above (see css_tokenizer.c):
// hands the request straight to wget_malloc().
void *yyalloc(size_t size)
{
	void *mem = wget_malloc(size);

	return mem;
}
// Reallocation hook for the scanner declared above (see css_tokenizer.c):
// hands the request straight to wget_realloc().
void *yyrealloc(void *p, size_t size)
{
	void *mem = wget_realloc(p, size);

	return mem;
}
70 | | |
71 | | void wget_css_parse_buffer( |
72 | | const char *buf, |
73 | | size_t len, |
74 | | wget_css_parse_uri_callback *callback_uri, |
75 | | wget_css_parse_encoding_callback *callback_encoding, |
76 | | void *user_ctx) |
77 | 3.30k | { |
78 | 3.30k | int token; |
79 | 3.30k | size_t length, pos = 0; |
80 | 3.30k | char *text; |
81 | 3.30k | yyscan_t scanner; |
82 | | |
83 | 3.30k | yylex_init(&scanner); |
84 | 3.30k | yy_scan_bytes(buf, (int) len, scanner); |
85 | | |
86 | 75.9k | while ((token = yylex(scanner)) != CSSEOF) { |
87 | 72.6k | if (token == IMPORT_SYM) { |
88 | | // e.g. @import "https://example.com/index.html" |
89 | 2.85k | pos += yyget_leng(scanner); |
90 | | |
91 | | // skip whitespace before URI/STRING |
92 | 3.66k | while ((token = yylex(scanner)) == S) |
93 | 813 | pos += yyget_leng(scanner); |
94 | | |
95 | | // now token should be STRING or URI |
96 | 2.85k | if (token == STRING) |
97 | 1.17k | token = URI; |
98 | 2.85k | } |
99 | | |
100 | 72.6k | if (token == URI && callback_uri) { |
101 | | // e.g. url(https://example.com/index.html) |
102 | 16.6k | text = yyget_text(scanner); |
103 | 16.6k | length = yyget_leng(scanner); |
104 | | |
105 | 16.6k | if (*text == '\'' || *text == '\"') { |
106 | | // a string - remove the quotes |
107 | 791 | callback_uri(user_ctx, text + 1, length - 2, pos + 1); |
108 | 15.8k | } else { |
109 | | // extract URI from url(...) |
110 | 15.8k | if (!wget_strncasecmp_ascii(text, "url(", 4)) { |
111 | 14.6k | char *otext = text; |
112 | | |
113 | | // remove trailing ) and any spaces before |
114 | 15.9k | for (length--; c_isspace(text[length - 1]); length--); |
115 | | |
116 | | // remove leading url( and any spaces after |
117 | 15.4k | for (length -= 4, text += 4; length && c_isspace(*text); text++, length--); |
118 | | |
119 | | // remove quotes |
120 | 14.6k | if (length && (*text == '\'' || *text == '\"')) { |
121 | 834 | text++; |
122 | 834 | length--; |
123 | 834 | } |
124 | | |
125 | 14.6k | if (length && (text[length - 1] == '\'' || text[length - 1] == '\"')) |
126 | 838 | length--; |
127 | | |
128 | 14.6k | callback_uri(user_ctx, text, length, pos + (text - otext)); |
129 | 14.6k | } |
130 | 15.8k | } |
131 | 55.9k | } else if (token == CHARSET_SYM && callback_encoding) { |
132 | | // e.g. @charset "UTF-8" |
133 | 1.31k | pos += yyget_leng(scanner); |
134 | | |
135 | | // skip whitespace before charset name |
136 | 1.71k | while ((token = yylex(scanner)) == S) |
137 | 401 | pos += yyget_leng(scanner); |
138 | | |
139 | | // now token should be STRING |
140 | 1.31k | if (token == STRING) { |
141 | 789 | text = yyget_text(scanner); |
142 | 789 | length = yyget_leng(scanner); |
143 | | |
144 | 789 | if (*text == '\'' || *text == '\"') { |
145 | | // a string - remove the quotes |
146 | 391 | callback_encoding(user_ctx, text + 1, length - 2); |
147 | 398 | } else { |
148 | | // a string without quotes |
149 | 398 | callback_encoding(user_ctx, text, length); |
150 | 398 | } |
151 | 789 | } else { |
152 | 521 | error_printf(_("Unknown token after @charset: %d\n"), token); |
153 | 521 | } |
154 | 1.31k | } |
155 | 72.6k | pos += yyget_leng(scanner); |
156 | 72.6k | } |
157 | | |
158 | 3.30k | yylex_destroy(scanner); |
159 | 3.30k | } |
160 | | |
161 | | void wget_css_parse_file( |
162 | | const char *fname, |
163 | | wget_css_parse_uri_callback *callback_uri, |
164 | | wget_css_parse_encoding_callback *callback_encoding, |
165 | | void *user_ctx) |
166 | 0 | { |
167 | 0 | if (strcmp(fname,"-")) { |
168 | 0 | int fd; |
169 | |
|
170 | 0 | if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) { |
171 | 0 | struct stat st; |
172 | 0 | if (fstat(fd, &st) == 0) { |
173 | 0 | #ifdef HAVE_MMAP |
174 | 0 | size_t nread = st.st_size; |
175 | 0 | char *buf = mmap(NULL, nread + 1, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); |
176 | | #else |
177 | | char *buf=wget_malloc(st.st_size+1); |
178 | | size_t nread=read(fd,buf,st.st_size); |
179 | | #endif |
180 | |
|
181 | 0 | if (nread > 0) { |
182 | 0 | buf[nread] = 0; // PROT_WRITE allows this write, MAP_PRIVATE prevents changes in underlying file system |
183 | 0 | wget_css_parse_buffer(buf, st.st_size, callback_uri, callback_encoding, user_ctx); |
184 | 0 | } |
185 | |
|
186 | 0 | #ifdef HAVE_MMAP |
187 | 0 | munmap(buf, nread); |
188 | | #else |
189 | | xfree(buf); |
190 | | #endif |
191 | 0 | } |
192 | 0 | close(fd); |
193 | 0 | } else |
194 | 0 | error_printf(_("Failed to open %s\n"), fname); |
195 | 0 | } else { |
196 | | // read data from STDIN. |
197 | | // maybe should use yy_scan_bytes instead of buffering into memory. |
198 | 0 | char tmp[4096]; |
199 | 0 | ssize_t nbytes; |
200 | 0 | wget_buffer buf; |
201 | |
|
202 | 0 | wget_buffer_init(&buf, NULL, 4096); |
203 | |
|
204 | 0 | while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) { |
205 | 0 | wget_buffer_memcat(&buf, tmp, nbytes); |
206 | 0 | } |
207 | |
|
208 | 0 | if (buf.length) |
209 | 0 | wget_css_parse_buffer(buf.data, buf.length, callback_uri, callback_encoding, user_ctx); |
210 | |
|
211 | 0 | wget_buffer_deinit(&buf); |
212 | 0 | } |
213 | 0 | } |