/src/readstat/src/spss/readstat_sav_read.c
Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | #include <stdio.h> |
3 | | #include <stdlib.h> |
4 | | #include <string.h> |
5 | | #include <sys/types.h> |
6 | | #include <stdint.h> |
7 | | #include <math.h> |
8 | | #include <float.h> |
9 | | #include <time.h> |
10 | | #include <limits.h> |
11 | | |
12 | | #include "../readstat.h" |
13 | | #include "../readstat_bits.h" |
14 | | #include "../readstat_iconv.h" |
15 | | #include "../readstat_convert.h" |
16 | | #include "../readstat_malloc.h" |
17 | | |
18 | | #include "readstat_sav.h" |
19 | | #include "readstat_sav_compress.h" |
20 | | #include "readstat_sav_parse.h" |
21 | | #include "readstat_sav_parse_timestamp.h" |
22 | | |
23 | | #if HAVE_ZLIB |
24 | | #include "readstat_zsav_read.h" |
25 | | #endif |
26 | | |
27 | | #define DATA_BUFFER_SIZE 65536 |
28 | 166k | #define VERY_LONG_STRING_MAX_LENGTH INT_MAX |
29 | | |
30 | | /* Others defined in table below */ |
31 | | |
32 | | /* See http://msdn.microsoft.com/en-us/library/dd317756(VS.85).aspx */ |
/*
 * Table pairing a numeric character-set code with an encoding name.
 * Entries are listed in ascending code order. Codes 1-4 are small legacy
 * identifiers; the remaining codes look like Windows code-page numbers
 * (see the MSDN link above).
 * NOTE(review): the names are presumably handed to iconv by whichever code
 * looks a code up in this table -- confirm at the lookup site.
 */
static readstat_charset_entry_t _charset_table[] = {
    { .code = 1, .name = "EBCDIC-US" },
    { .code = 2, .name = "WINDOWS-1252" }, /* supposed to be ASCII, but some files are miscoded */
    { .code = 3, .name = "WINDOWS-1252" },
    { .code = 4, .name = "DEC-KANJI" },
    { .code = 437, .name = "CP437" },
    { .code = 708, .name = "ASMO-708" },
    { .code = 737, .name = "CP737" },
    { .code = 775, .name = "CP775" },
    { .code = 850, .name = "CP850" },
    { .code = 852, .name = "CP852" },
    { .code = 855, .name = "CP855" },
    { .code = 857, .name = "CP857" },
    { .code = 858, .name = "CP858" },
    { .code = 860, .name = "CP860" },
    { .code = 861, .name = "CP861" },
    { .code = 862, .name = "CP862" },
    { .code = 863, .name = "CP863" },
    { .code = 864, .name = "CP864" },
    { .code = 865, .name = "CP865" },
    { .code = 866, .name = "CP866" },
    { .code = 869, .name = "CP869" },
    { .code = 874, .name = "CP874" },
    { .code = 932, .name = "CP932" },
    { .code = 936, .name = "CP936" },
    { .code = 949, .name = "CP949" },
    { .code = 950, .name = "BIG-5" },
    { .code = 1200, .name = "UTF-16LE" },
    { .code = 1201, .name = "UTF-16BE" },
    { .code = 1250, .name = "WINDOWS-1250" },
    { .code = 1251, .name = "WINDOWS-1251" },
    { .code = 1252, .name = "WINDOWS-1252" },
    { .code = 1253, .name = "WINDOWS-1253" },
    { .code = 1254, .name = "WINDOWS-1254" },
    { .code = 1255, .name = "WINDOWS-1255" },
    { .code = 1256, .name = "WINDOWS-1256" },
    { .code = 1257, .name = "WINDOWS-1257" },
    { .code = 1258, .name = "WINDOWS-1258" },
    { .code = 1361, .name = "CP1361" },
    { .code = 10000, .name = "MACROMAN" },
    { .code = 10004, .name = "MACARABIC" },
    { .code = 10005, .name = "MACHEBREW" },
    { .code = 10006, .name = "MACGREEK" },
    { .code = 10007, .name = "MACCYRILLIC" },
    { .code = 10010, .name = "MACROMANIA" },
    { .code = 10017, .name = "MACUKRAINE" },
    { .code = 10021, .name = "MACTHAI" },
    { .code = 10029, .name = "MACCENTRALEUROPE" },
    { .code = 10079, .name = "MACICELAND" },
    { .code = 10081, .name = "MACTURKISH" },
    { .code = 10082, .name = "MACCROATIAN" },
    { .code = 12000, .name = "UTF-32LE" },
    { .code = 12001, .name = "UTF-32BE" },
    { .code = 20127, .name = "US-ASCII" },
    { .code = 20866, .name = "KOI8-R" },
    { .code = 20932, .name = "EUC-JP" },
    { .code = 21866, .name = "KOI8-U" },
    { .code = 28591, .name = "ISO-8859-1" },
    { .code = 28592, .name = "ISO-8859-2" },
    { .code = 28593, .name = "ISO-8859-3" },
    { .code = 28594, .name = "ISO-8859-4" },
    { .code = 28595, .name = "ISO-8859-5" },
    { .code = 28596, .name = "ISO-8859-6" },
    { .code = 28597, .name = "ISO-8859-7" },
    { .code = 28598, .name = "ISO-8859-8" },
    { .code = 28599, .name = "ISO-8859-9" },
    { .code = 28603, .name = "ISO-8859-13" },
    { .code = 28605, .name = "ISO-8859-15" },
    { .code = 50220, .name = "ISO-2022-JP" },
    { .code = 50221, .name = "ISO-2022-JP" }, // same as above?
    { .code = 50222, .name = "ISO-2022-JP" }, // same as above?
    { .code = 50225, .name = "ISO-2022-KR" },
    { .code = 50229, .name = "ISO-2022-CN" },
    { .code = 51932, .name = "EUC-JP" },
    { .code = 51936, .name = "GBK" },
    { .code = 51949, .name = "EUC-KR" },
    { .code = 52936, .name = "HZ-GB-2312" },
    { .code = 54936, .name = "GB18030" },
    { .code = 65000, .name = "UTF-7" },
    { .code = 65001, .name = "UTF-8" }
};
114 | | |
115 | 185k | #define SAV_LABEL_NAME_PREFIX "labels" |
116 | | |
/* One (value, label) pair collected while reading a value label record. */
typedef struct value_label_s {
    /* The 8 value bytes exactly as read from the file. */
    char raw_value[8];
    /* raw_value after character-set conversion; used when the value is a
     * string. Sized as 8 input bytes * 4 + 1 (presumably worst-case UTF-8
     * expansion plus a terminator). */
    char utf8_string_value[8*4+1];
    /* Typed value that is handed to the value_label handler. */
    readstat_value_t final_value;
    /* Heap-allocated converted label text; released by
     * sav_read_value_label_record() during its cleanup. */
    char *label;
} value_label_t;
123 | | |
/* Progress reporting and data-section readers. */
static readstat_error_t sav_update_progress(sav_ctx_t *ctx);
static readstat_error_t sav_read_data(sav_ctx_t *ctx);
static readstat_error_t sav_read_compressed_data(sav_ctx_t *ctx,
        readstat_error_t (*row_handler)(unsigned char *, size_t, sav_ctx_t *));
static readstat_error_t sav_read_uncompressed_data(sav_ctx_t *ctx,
        readstat_error_t (*row_handler)(unsigned char *, size_t, sav_ctx_t *));

/* Variable records: skip over one, or parse it. */
static readstat_error_t sav_skip_variable_record(sav_ctx_t *ctx);
static readstat_error_t sav_read_variable_record(sav_ctx_t *ctx);

/* Document records: skip over one, or forward each line to the note handler. */
static readstat_error_t sav_skip_document_record(sav_ctx_t *ctx);
static readstat_error_t sav_read_document_record(sav_ctx_t *ctx);

/* Value label records: skip over one, or parse and submit it. */
static readstat_error_t sav_skip_value_label_record(sav_ctx_t *ctx);
static readstat_error_t sav_read_value_label_record(sav_ctx_t *ctx);

/* Dictionary termination record (a single 4-byte filler field). */
static readstat_error_t sav_read_dictionary_termination_record(sav_ctx_t *ctx);

/* Parsers for the remaining record kinds; these operate on data already held in memory. */
static readstat_error_t sav_parse_machine_floating_point_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx);
static readstat_error_t sav_store_variable_display_parameter_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx);
static readstat_error_t sav_parse_variable_display_parameter_record(sav_ctx_t *ctx);
static readstat_error_t sav_parse_machine_integer_info_record(const void *data, size_t data_len, sav_ctx_t *ctx);
static readstat_error_t sav_parse_long_string_value_labels_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx);
static readstat_error_t sav_parse_long_string_missing_values_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx);
148 | | |
149 | 1.35M | static void sav_tag_missing_double(readstat_value_t *value, sav_ctx_t *ctx) { |
150 | 1.35M | double fp_value = value->v.double_value; |
151 | 1.35M | uint64_t long_value = 0; |
152 | 1.35M | memcpy(&long_value, &fp_value, 8); |
153 | 1.35M | if (long_value == ctx->missing_double) |
154 | 99.7k | value->is_system_missing = 1; |
155 | 1.35M | if (long_value == ctx->lowest_double) |
156 | 411 | value->is_system_missing = 1; |
157 | 1.35M | if (long_value == ctx->highest_double) |
158 | 426 | value->is_system_missing = 1; |
159 | 1.35M | if (isnan(fp_value)) |
160 | 64.3k | value->is_system_missing = 1; |
161 | 1.35M | } |
162 | | |
163 | 6.07k | static readstat_error_t sav_update_progress(sav_ctx_t *ctx) { |
164 | 6.07k | readstat_io_t *io = ctx->io; |
165 | 6.07k | return io->update(ctx->file_size, ctx->handle.progress, ctx->user_ctx, io->io_ctx); |
166 | 6.07k | } |
167 | | |
168 | 281k | static readstat_error_t sav_skip_variable_record(sav_ctx_t *ctx) { |
169 | 281k | sav_variable_record_t variable; |
170 | 281k | readstat_error_t retval = READSTAT_OK; |
171 | 281k | readstat_io_t *io = ctx->io; |
172 | 281k | if (io->read(&variable, sizeof(sav_variable_record_t), io->io_ctx) < sizeof(sav_variable_record_t)) { |
173 | 9 | retval = READSTAT_ERROR_READ; |
174 | 9 | goto cleanup; |
175 | 9 | } |
176 | 281k | if (variable.has_var_label) { |
177 | 4.59k | uint32_t label_len; |
178 | 4.59k | if (io->read(&label_len, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
179 | 50 | retval = READSTAT_ERROR_READ; |
180 | 50 | goto cleanup; |
181 | 50 | } |
182 | 4.54k | label_len = ctx->bswap ? byteswap4(label_len) : label_len; |
183 | 4.54k | uint32_t label_capacity = (label_len + 3) / 4 * 4; |
184 | 4.54k | if (io->seek(label_capacity, READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
185 | 6 | retval = READSTAT_ERROR_SEEK; |
186 | 6 | goto cleanup; |
187 | 6 | } |
188 | 4.54k | } |
189 | 281k | if (variable.n_missing_values) { |
190 | 6.63k | int n_missing_values = ctx->bswap ? byteswap4(variable.n_missing_values) : variable.n_missing_values; |
191 | 6.63k | if (io->seek(abs(n_missing_values) * sizeof(double), READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
192 | 80 | retval = READSTAT_ERROR_SEEK; |
193 | 80 | goto cleanup; |
194 | 80 | } |
195 | 6.63k | } |
196 | 281k | cleanup: |
197 | 281k | return retval; |
198 | 281k | } |
199 | | |
200 | 1.67k | static readstat_error_t sav_read_variable_label(spss_varinfo_t *info, sav_ctx_t *ctx) { |
201 | 1.67k | readstat_io_t *io = ctx->io; |
202 | 1.67k | readstat_error_t retval = READSTAT_OK; |
203 | 1.67k | uint32_t label_len, label_capacity; |
204 | 1.67k | size_t out_label_len; |
205 | 1.67k | char *label_buf = NULL; |
206 | 1.67k | if (io->read(&label_len, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
207 | 6 | retval = READSTAT_ERROR_READ; |
208 | 6 | goto cleanup; |
209 | 6 | } |
210 | 1.67k | label_len = ctx->bswap ? byteswap4(label_len) : label_len; |
211 | | |
212 | 1.67k | if (label_len == 0) |
213 | 876 | goto cleanup; |
214 | | |
215 | 795 | label_capacity = (label_len + 3) / 4 * 4; |
216 | 795 | if ((label_buf = readstat_malloc(label_capacity)) == NULL) { |
217 | 34 | retval = READSTAT_ERROR_MALLOC; |
218 | 34 | goto cleanup; |
219 | 34 | } |
220 | | |
221 | 761 | out_label_len = (size_t)label_len*4+1; |
222 | 761 | if ((info->label = readstat_malloc(out_label_len)) == NULL) { |
223 | 2 | retval = READSTAT_ERROR_MALLOC; |
224 | 2 | goto cleanup; |
225 | 2 | } |
226 | | |
227 | 759 | if (io->read(label_buf, label_capacity, io->io_ctx) < label_capacity) { |
228 | 24 | retval = READSTAT_ERROR_READ; |
229 | 24 | goto cleanup; |
230 | 24 | } |
231 | | |
232 | 735 | retval = readstat_convert(info->label, out_label_len, label_buf, label_len, ctx->converter); |
233 | 735 | if (retval != READSTAT_OK) |
234 | 0 | goto cleanup; |
235 | | |
236 | 1.67k | cleanup: |
237 | 1.67k | if (label_buf) |
238 | 761 | free(label_buf); |
239 | | |
240 | 1.67k | if (retval != READSTAT_OK) { |
241 | 66 | if (info->label) { |
242 | 24 | free(info->label); |
243 | 24 | info->label = NULL; |
244 | 24 | } |
245 | 66 | } |
246 | | |
247 | 1.67k | return retval; |
248 | 735 | } |
249 | | |
250 | 1.43k | static readstat_error_t sav_read_variable_missing_double_values(spss_varinfo_t *info, sav_ctx_t *ctx) { |
251 | 1.43k | readstat_io_t *io = ctx->io; |
252 | 1.43k | int i; |
253 | 1.43k | readstat_error_t retval = READSTAT_OK; |
254 | 1.43k | if (io->read(info->missing_double_values, info->n_missing_values * sizeof(double), io->io_ctx) |
255 | 1.43k | < info->n_missing_values * sizeof(double)) { |
256 | 6 | retval = READSTAT_ERROR_READ; |
257 | 6 | goto cleanup; |
258 | 6 | } |
259 | 4.08k | for (i=0; i<info->n_missing_values; i++) { |
260 | 2.65k | if (ctx->bswap) { |
261 | 1.08k | info->missing_double_values[i] = byteswap_double(info->missing_double_values[i]); |
262 | 1.08k | } |
263 | | |
264 | 2.65k | uint64_t long_value = 0; |
265 | 2.65k | memcpy(&long_value, &info->missing_double_values[i], 8); |
266 | | |
267 | 2.65k | if (long_value == ctx->missing_double) |
268 | 197 | info->missing_double_values[i] = NAN; |
269 | 2.65k | if (long_value == ctx->lowest_double) |
270 | 283 | info->missing_double_values[i] = -HUGE_VAL; |
271 | 2.65k | if (long_value == ctx->highest_double) |
272 | 289 | info->missing_double_values[i] = HUGE_VAL; |
273 | 2.65k | } |
274 | | |
275 | 1.43k | cleanup: |
276 | 1.43k | return retval; |
277 | 1.42k | } |
278 | | |
279 | 1.07k | static readstat_error_t sav_read_variable_missing_string_values(spss_varinfo_t *info, sav_ctx_t *ctx) { |
280 | 1.07k | readstat_io_t *io = ctx->io; |
281 | 1.07k | int i; |
282 | 1.07k | readstat_error_t retval = READSTAT_OK; |
283 | 3.51k | for (i=0; i<info->n_missing_values; i++) { |
284 | 2.44k | char missing_value[8]; |
285 | 2.44k | if (io->read(missing_value, sizeof(missing_value), io->io_ctx) < sizeof(missing_value)) { |
286 | 5 | retval = READSTAT_ERROR_READ; |
287 | 5 | goto cleanup; |
288 | 5 | } |
289 | 2.43k | retval = readstat_convert(info->missing_string_values[i], sizeof(info->missing_string_values[0]), |
290 | 2.43k | missing_value, sizeof(missing_value), ctx->converter); |
291 | 2.43k | if (retval != READSTAT_OK) |
292 | 1 | goto cleanup; |
293 | 2.43k | } |
294 | | |
295 | 1.07k | cleanup: |
296 | 1.07k | return retval; |
297 | 1.07k | } |
298 | | |
299 | 2.55k | static readstat_error_t sav_read_variable_missing_values(spss_varinfo_t *info, sav_ctx_t *ctx) { |
300 | 2.55k | if (info->n_missing_values > 3 || info->n_missing_values < -3) { |
301 | 52 | return READSTAT_ERROR_PARSE; |
302 | 52 | } |
303 | 2.50k | if (info->n_missing_values < 0) { |
304 | 1.54k | info->missing_range = 1; |
305 | 1.54k | info->n_missing_values = abs(info->n_missing_values); |
306 | 1.54k | } else { |
307 | 957 | info->missing_range = 0; |
308 | 957 | } |
309 | 2.50k | if (info->type == READSTAT_TYPE_DOUBLE) { |
310 | 1.43k | return sav_read_variable_missing_double_values(info, ctx); |
311 | 1.43k | } |
312 | 1.07k | return sav_read_variable_missing_string_values(info, ctx); |
313 | 2.50k | } |
314 | | |
/*
 * Read one variable record and either extend the previous variable or
 * append a new spss_varinfo_t to ctx->varinfo.
 *
 * A record whose type field is negative is treated as a continuation of the
 * previous variable: it only bumps ctx->var_offset and the previous
 * variable's width. Any other record allocates a new varinfo, fills in its
 * name, formats and type, and reads the optional label and missing values.
 *
 * Returns READSTAT_OK on success or an error code; on failure the partially
 * built varinfo is released through spss_varinfo_free().
 */
static readstat_error_t sav_read_variable_record(sav_ctx_t *ctx) {
    readstat_io_t *io = ctx->io;
    sav_variable_record_t variable = { 0 };
    spss_varinfo_t *info = NULL;
    readstat_error_t retval = READSTAT_OK;
    /* Grow the varinfo pointer array (doubling) once it is full. */
    if (ctx->var_index == ctx->varinfo_capacity) {
        /* NOTE(review): on failure ctx->varinfo is overwritten with NULL and the
         * capacity has already been doubled; whether the old array is leaked or
         * freed depends on readstat_realloc -- confirm. */
        if ((ctx->varinfo = readstat_realloc(ctx->varinfo, (ctx->varinfo_capacity *= 2) * sizeof(spss_varinfo_t *))) == NULL) {
            retval = READSTAT_ERROR_MALLOC;
            goto cleanup;
        }
    }
    if (io->read(&variable, sizeof(sav_variable_record_t), io->io_ctx) < sizeof(sav_variable_record_t)) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    variable.print = ctx->bswap ? byteswap4(variable.print) : variable.print;
    variable.write = ctx->bswap ? byteswap4(variable.write) : variable.write;

    int32_t type = ctx->bswap ? byteswap4(variable.type) : variable.type;
    /* Negative type: continuation of the previous variable, so there must be one. */
    if (type < 0) {
        if (ctx->var_index == 0) {
            return READSTAT_ERROR_PARSE;
        }
        ctx->var_offset++;
        ctx->varinfo[ctx->var_index-1]->width++;
        return 0;
    }

    if ((info = readstat_calloc(1, sizeof(spss_varinfo_t))) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }
    info->width = 1;
    info->n_segments = 1;
    info->index = ctx->var_index;
    info->offset = ctx->var_offset;
    info->labels_index = -1; /* no value label set attached yet */

    /* Both name fields start out as the record's name, converted with a NULL converter. */
    retval = readstat_convert(info->name, sizeof(info->name),
            variable.name, sizeof(variable.name), NULL);
    if (retval != READSTAT_OK)
        goto cleanup;

    retval = readstat_convert(info->longname, sizeof(info->longname),
            variable.name, sizeof(variable.name), NULL);
    if (retval != READSTAT_OK)
        goto cleanup;

    /* Print/write formats are packed as: byte 0 = decimals, byte 1 = width, byte 2 = type. */
    info->print_format.decimal_places = (variable.print & 0x000000FF);
    info->print_format.width = (variable.print & 0x0000FF00) >> 8;
    info->print_format.type = (variable.print & 0x00FF0000) >> 16;

    info->write_format.decimal_places = (variable.write & 0x000000FF);
    info->write_format.width = (variable.write & 0x0000FF00) >> 8;
    info->write_format.type = (variable.write & 0x00FF0000) >> 16;

    /* A positive type, or an "A" print/write format, marks a string variable. */
    if (type > 0 || info->print_format.type == SPSS_FORMAT_TYPE_A || info->write_format.type == SPSS_FORMAT_TYPE_A) {
        info->type = READSTAT_TYPE_STRING;
    } else {
        info->type = READSTAT_TYPE_DOUBLE;
    }

    if (variable.has_var_label) {
        if ((retval = sav_read_variable_label(info, ctx)) != READSTAT_OK) {
            goto cleanup;
        }
    }

    if (variable.n_missing_values) {
        info->n_missing_values = ctx->bswap ? byteswap4(variable.n_missing_values) : variable.n_missing_values;
        if ((retval = sav_read_variable_missing_values(info, ctx)) != READSTAT_OK) {
            goto cleanup;
        }
    }

    /* Success: the context now owns info. */
    ctx->varinfo[ctx->var_index] = info;

    ctx->var_index++;
    ctx->var_offset++;

cleanup:
    if (retval != READSTAT_OK) {
        /* NOTE(review): info is still NULL when the realloc or the header read
         * failed; confirm spss_varinfo_free() tolerates NULL. */
        spss_varinfo_free(info);
    }

    return retval;
}
402 | | |
403 | 28.5k | static readstat_error_t sav_skip_value_label_record(sav_ctx_t *ctx) { |
404 | 28.5k | uint32_t label_count; |
405 | 28.5k | uint32_t rec_type; |
406 | 28.5k | uint32_t var_count; |
407 | 28.5k | readstat_error_t retval = READSTAT_OK; |
408 | 28.5k | readstat_io_t *io = ctx->io; |
409 | 28.5k | if (io->read(&label_count, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
410 | 3 | retval = READSTAT_ERROR_READ; |
411 | 3 | goto cleanup; |
412 | 3 | } |
413 | 28.5k | if (ctx->bswap) |
414 | 27.2k | label_count = byteswap4(label_count); |
415 | 28.5k | int i; |
416 | 235k | for (i=0; i<label_count; i++) { |
417 | 207k | unsigned char unpadded_len = 0; |
418 | 207k | size_t padded_len = 0; |
419 | 207k | if (io->seek(8, READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
420 | 79 | retval = READSTAT_ERROR_SEEK; |
421 | 79 | goto cleanup; |
422 | 79 | } |
423 | 207k | if (io->read(&unpadded_len, 1, io->io_ctx) < 1) { |
424 | 19 | retval = READSTAT_ERROR_READ; |
425 | 19 | goto cleanup; |
426 | 19 | } |
427 | 207k | padded_len = (unpadded_len + 8) / 8 * 8 - 1; |
428 | 207k | if (io->seek(padded_len, READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
429 | 3 | retval = READSTAT_ERROR_SEEK; |
430 | 3 | goto cleanup; |
431 | 3 | } |
432 | 207k | } |
433 | | |
434 | 28.4k | if (io->read(&rec_type, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
435 | 17 | retval = READSTAT_ERROR_READ; |
436 | 17 | goto cleanup; |
437 | 17 | } |
438 | 28.4k | if (ctx->bswap) |
439 | 27.1k | rec_type = byteswap4(rec_type); |
440 | | |
441 | 28.4k | if (rec_type != 4) { |
442 | 52 | retval = READSTAT_ERROR_PARSE; |
443 | 52 | goto cleanup; |
444 | 52 | } |
445 | 28.4k | if (io->read(&var_count, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
446 | 5 | retval = READSTAT_ERROR_READ; |
447 | 5 | goto cleanup; |
448 | 5 | } |
449 | 28.4k | if (ctx->bswap) |
450 | 27.1k | var_count = byteswap4(var_count); |
451 | | |
452 | 28.4k | if (io->seek(var_count * sizeof(uint32_t), READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
453 | 9 | retval = READSTAT_ERROR_SEEK; |
454 | 9 | goto cleanup; |
455 | 9 | } |
456 | | |
457 | 28.5k | cleanup: |
458 | 28.5k | return retval; |
459 | 28.4k | } |
460 | | |
461 | | static readstat_error_t sav_submit_value_labels(value_label_t *value_labels, int32_t label_count, |
462 | 18.2k | readstat_type_t value_type, sav_ctx_t *ctx) { |
463 | 18.2k | char label_name_buf[256]; |
464 | 18.2k | readstat_error_t retval = READSTAT_OK; |
465 | 18.2k | int32_t i; |
466 | | |
467 | 18.2k | snprintf(label_name_buf, sizeof(label_name_buf), SAV_LABEL_NAME_PREFIX "%d", ctx->value_labels_count); |
468 | | |
469 | 88.5k | for (i=0; i<label_count; i++) { |
470 | 70.2k | value_label_t *vlabel = &value_labels[i]; |
471 | 70.2k | if (ctx->handle.value_label(label_name_buf, vlabel->final_value, vlabel->label, ctx->user_ctx) != READSTAT_HANDLER_OK) { |
472 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
473 | 0 | goto cleanup; |
474 | 0 | } |
475 | 70.2k | } |
476 | 18.2k | cleanup: |
477 | 18.2k | return retval; |
478 | 18.2k | } |
479 | | |
480 | 18.4k | static readstat_error_t sav_read_value_label_record(sav_ctx_t *ctx) { |
481 | 18.4k | uint32_t label_count; |
482 | 18.4k | readstat_error_t retval = READSTAT_OK; |
483 | 18.4k | readstat_io_t *io = ctx->io; |
484 | 18.4k | uint32_t *vars = NULL; |
485 | 18.4k | uint32_t var_count; |
486 | 18.4k | int32_t rec_type; |
487 | 18.4k | readstat_type_t value_type = READSTAT_TYPE_STRING; |
488 | 18.4k | char label_buf[256]; |
489 | 18.4k | value_label_t *value_labels = NULL; |
490 | | |
491 | 18.4k | if (io->read(&label_count, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
492 | 3 | retval = READSTAT_ERROR_READ; |
493 | 3 | goto cleanup; |
494 | 3 | } |
495 | 18.4k | if (ctx->bswap) |
496 | 17.5k | label_count = byteswap4(label_count); |
497 | | |
498 | 18.4k | if (label_count && (value_labels = readstat_calloc(label_count, sizeof(value_label_t))) == NULL) { |
499 | 26 | retval = READSTAT_ERROR_MALLOC; |
500 | 26 | goto cleanup; |
501 | 26 | } |
502 | | |
503 | 18.4k | int i; |
504 | 88.6k | for (i=0; i<label_count; i++) { |
505 | 70.2k | value_label_t *vlabel = &value_labels[i]; |
506 | 70.2k | unsigned char unpadded_label_len = 0; |
507 | 70.2k | size_t padded_label_len = 0, utf8_label_len = 0; |
508 | | |
509 | 70.2k | if (io->read(vlabel->raw_value, 8, io->io_ctx) < 8) { |
510 | 30 | retval = READSTAT_ERROR_READ; |
511 | 30 | goto cleanup; |
512 | 30 | } |
513 | 70.2k | if (io->read(&unpadded_label_len, 1, io->io_ctx) < 1) { |
514 | 1 | retval = READSTAT_ERROR_READ; |
515 | 1 | goto cleanup; |
516 | 1 | } |
517 | | |
518 | 70.2k | padded_label_len = (unpadded_label_len + 8) / 8 * 8 - 1; |
519 | 70.2k | if (io->read(label_buf, padded_label_len, io->io_ctx) < padded_label_len) { |
520 | 3 | retval = READSTAT_ERROR_READ; |
521 | 3 | goto cleanup; |
522 | 3 | } |
523 | | |
524 | 70.2k | utf8_label_len = padded_label_len*4+1; |
525 | 70.2k | if ((vlabel->label = readstat_malloc(utf8_label_len)) == NULL) { |
526 | 0 | retval = READSTAT_ERROR_MALLOC; |
527 | 0 | goto cleanup; |
528 | 0 | } |
529 | | |
530 | 70.2k | retval = readstat_convert(vlabel->label, utf8_label_len, label_buf, padded_label_len, ctx->converter); |
531 | 70.2k | if (retval != READSTAT_OK) |
532 | 1 | goto cleanup; |
533 | 70.2k | } |
534 | | |
535 | 18.3k | if (io->read(&rec_type, sizeof(int32_t), io->io_ctx) < sizeof(int32_t)) { |
536 | 3 | retval = READSTAT_ERROR_READ; |
537 | 3 | goto cleanup; |
538 | 3 | } |
539 | 18.3k | if (ctx->bswap) |
540 | 17.5k | rec_type = byteswap4(rec_type); |
541 | | |
542 | 18.3k | if (rec_type != 4) { |
543 | 46 | retval = READSTAT_ERROR_PARSE; |
544 | 46 | goto cleanup; |
545 | 46 | } |
546 | 18.3k | if (io->read(&var_count, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
547 | 3 | retval = READSTAT_ERROR_READ; |
548 | 3 | goto cleanup; |
549 | 3 | } |
550 | 18.3k | if (ctx->bswap) |
551 | 17.5k | var_count = byteswap4(var_count); |
552 | | |
553 | 18.3k | if (var_count && (vars = readstat_malloc(var_count * sizeof(uint32_t))) == NULL) { |
554 | 37 | retval = READSTAT_ERROR_MALLOC; |
555 | 37 | goto cleanup; |
556 | 37 | } |
557 | 18.3k | if (io->read(vars, var_count * sizeof(uint32_t), io->io_ctx) < var_count * sizeof(uint32_t)) { |
558 | 24 | retval = READSTAT_ERROR_READ; |
559 | 24 | goto cleanup; |
560 | 24 | } |
561 | 100k | for (i=0; i<var_count; i++) { |
562 | 82.5k | uint32_t var_offset = vars[i]; |
563 | 82.5k | if (ctx->bswap) |
564 | 15.2k | var_offset = byteswap4(var_offset); |
565 | | |
566 | 82.5k | var_offset--; // Why subtract 1???? |
567 | 82.5k | spss_varinfo_t **var = bsearch(&var_offset, ctx->varinfo, ctx->var_index, sizeof(spss_varinfo_t *), |
568 | 82.5k | &spss_varinfo_compare); |
569 | 82.5k | if (var) { |
570 | 12.7k | (*var)->labels_index = ctx->value_labels_count; |
571 | 12.7k | value_type = (*var)->type; |
572 | 12.7k | } |
573 | 82.5k | } |
574 | | |
575 | 88.5k | for (i=0; i<label_count; i++) { |
576 | 70.2k | value_label_t *vlabel = &value_labels[i]; |
577 | 70.2k | double val_d = 0.0; |
578 | 70.2k | vlabel->final_value.type = value_type; |
579 | 70.2k | if (value_type == READSTAT_TYPE_DOUBLE) { |
580 | 15.1k | memcpy(&val_d, vlabel->raw_value, 8); |
581 | 15.1k | if (ctx->bswap) |
582 | 14.6k | val_d = byteswap_double(val_d); |
583 | | |
584 | 15.1k | vlabel->final_value.v.double_value = val_d; |
585 | 15.1k | sav_tag_missing_double(&vlabel->final_value, ctx); |
586 | 55.0k | } else { |
587 | 55.0k | retval = readstat_convert(vlabel->utf8_string_value, sizeof(vlabel->utf8_string_value), |
588 | 55.0k | vlabel->raw_value, 8, ctx->converter); |
589 | 55.0k | if (retval != READSTAT_OK) |
590 | 2 | break; |
591 | | |
592 | 55.0k | vlabel->final_value.v.string_value = vlabel->utf8_string_value; |
593 | 55.0k | } |
594 | 70.2k | } |
595 | | |
596 | 18.2k | if (ctx->handle.value_label) { |
597 | 18.2k | sav_submit_value_labels(value_labels, label_count, value_type, ctx); |
598 | 18.2k | } |
599 | 18.2k | ctx->value_labels_count++; |
600 | 18.4k | cleanup: |
601 | 18.4k | if (vars) |
602 | 5.34k | free(vars); |
603 | 18.4k | if (value_labels) { |
604 | 6.06M | for (i=0; i<label_count; i++) { |
605 | 6.06M | value_label_t *vlabel = &value_labels[i]; |
606 | 6.06M | if (vlabel->label) |
607 | 70.2k | free(vlabel->label); |
608 | 6.06M | } |
609 | 1.83k | free(value_labels); |
610 | 1.83k | } |
611 | | |
612 | 18.4k | return retval; |
613 | 18.2k | } |
614 | | |
615 | 1.92k | static readstat_error_t sav_skip_document_record(sav_ctx_t *ctx) { |
616 | 1.92k | uint32_t n_lines; |
617 | 1.92k | readstat_error_t retval = READSTAT_OK; |
618 | 1.92k | readstat_io_t *io = ctx->io; |
619 | 1.92k | if (io->read(&n_lines, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
620 | 3 | retval = READSTAT_ERROR_READ; |
621 | 3 | goto cleanup; |
622 | 3 | } |
623 | 1.92k | if (ctx->bswap) |
624 | 1.48k | n_lines = byteswap4(n_lines); |
625 | 1.92k | if (io->seek(n_lines * SPSS_DOC_LINE_SIZE, READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
626 | 54 | retval = READSTAT_ERROR_SEEK; |
627 | 54 | goto cleanup; |
628 | 54 | } |
629 | | |
630 | 1.92k | cleanup: |
631 | 1.92k | return retval; |
632 | 1.92k | } |
633 | | |
634 | 1.06k | static readstat_error_t sav_read_document_record(sav_ctx_t *ctx) { |
635 | 1.06k | if (!ctx->handle.note) |
636 | 0 | return sav_skip_document_record(ctx); |
637 | | |
638 | 1.06k | uint32_t n_lines; |
639 | 1.06k | readstat_error_t retval = READSTAT_OK; |
640 | 1.06k | readstat_io_t *io = ctx->io; |
641 | 1.06k | if (io->read(&n_lines, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
642 | 4 | retval = READSTAT_ERROR_READ; |
643 | 4 | goto cleanup; |
644 | 4 | } |
645 | 1.05k | if (ctx->bswap) |
646 | 542 | n_lines = byteswap4(n_lines); |
647 | | |
648 | 1.05k | char raw_buffer[SPSS_DOC_LINE_SIZE]; |
649 | 1.05k | char utf8_buffer[4*SPSS_DOC_LINE_SIZE+1]; |
650 | 1.05k | int i; |
651 | 1.45k | for (i=0; i<n_lines; i++) { |
652 | 458 | if (io->read(raw_buffer, SPSS_DOC_LINE_SIZE, io->io_ctx) < SPSS_DOC_LINE_SIZE) { |
653 | 61 | retval = READSTAT_ERROR_READ; |
654 | 61 | goto cleanup; |
655 | 61 | } |
656 | | |
657 | 397 | retval = readstat_convert(utf8_buffer, sizeof(utf8_buffer), |
658 | 397 | raw_buffer, sizeof(raw_buffer), ctx->converter); |
659 | 397 | if (retval != READSTAT_OK) |
660 | 1 | goto cleanup; |
661 | | |
662 | 396 | if (ctx->handle.note(i, utf8_buffer, ctx->user_ctx) != READSTAT_HANDLER_OK) { |
663 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
664 | 0 | goto cleanup; |
665 | 0 | } |
666 | 396 | } |
667 | | |
668 | 1.06k | cleanup: |
669 | 1.06k | return retval; |
670 | 1.05k | } |
671 | | |
672 | 2.76k | static readstat_error_t sav_read_dictionary_termination_record(sav_ctx_t *ctx) { |
673 | 2.76k | int32_t filler; |
674 | 2.76k | readstat_error_t retval = READSTAT_OK; |
675 | 2.76k | readstat_io_t *io = ctx->io; |
676 | 2.76k | if (io->read(&filler, sizeof(int32_t), io->io_ctx) < sizeof(int32_t)) { |
677 | 1.16k | retval = READSTAT_ERROR_READ; |
678 | 1.16k | } |
679 | 2.76k | return retval; |
680 | 2.76k | } |
681 | | |
682 | 2.26M | static readstat_error_t sav_process_row(unsigned char *buffer, size_t buffer_len, sav_ctx_t *ctx) { |
683 | 2.26M | if (ctx->row_offset) { |
684 | 0 | ctx->row_offset--; |
685 | 0 | return READSTAT_OK; |
686 | 0 | } |
687 | | |
688 | 2.26M | readstat_error_t retval = READSTAT_OK; |
689 | 2.26M | double fp_value; |
690 | 2.26M | int offset = 0; |
691 | 2.26M | readstat_off_t data_offset = 0; |
692 | 2.26M | size_t raw_str_used = 0; |
693 | 2.26M | int segment_offset = 0; |
694 | 2.26M | int var_index = 0, col = 0; |
695 | 2.26M | int raw_str_is_utf8 = ctx->input_encoding && !strcmp(ctx->input_encoding, "UTF-8"); |
696 | | |
697 | 4.56M | while (data_offset < buffer_len && col < ctx->var_index && var_index < ctx->var_index) { |
698 | 2.29M | spss_varinfo_t *col_info = ctx->varinfo[col]; |
699 | 2.29M | spss_varinfo_t *var_info = ctx->varinfo[var_index]; |
700 | 2.29M | readstat_value_t value = { .type = var_info->type }; |
701 | 2.29M | if (offset > 31) { |
702 | 2 | retval = READSTAT_ERROR_PARSE; |
703 | 2 | goto done; |
704 | 2 | } |
705 | 2.29M | if (var_info->type == READSTAT_TYPE_STRING) { |
706 | 963k | if (raw_str_used + 8 <= ctx->raw_string_len) { |
707 | 963k | if (raw_str_is_utf8) { |
708 | | /* Skip null bytes, see https://github.com/tidyverse/haven/issues/560 */ |
709 | 920 | char c; |
710 | 8.28k | for (int i=0; i<8; i++) |
711 | 7.36k | if ((c = buffer[data_offset+i])) |
712 | 3.93k | ctx->raw_string[raw_str_used++] = c; |
713 | 962k | } else { |
714 | 962k | memcpy(ctx->raw_string + raw_str_used, &buffer[data_offset], 8); |
715 | 962k | raw_str_used += 8; |
716 | 962k | } |
717 | 963k | } |
718 | 963k | if (++offset == col_info->width) { |
719 | 962k | if (++segment_offset < var_info->n_segments) { |
720 | 2.97k | raw_str_used--; |
721 | 2.97k | } |
722 | 962k | offset = 0; |
723 | 962k | col++; |
724 | 962k | } |
725 | 963k | if (segment_offset == var_info->n_segments) { |
726 | 959k | if (!ctx->variables[var_info->index]->skip) { |
727 | 959k | retval = readstat_convert(ctx->utf8_string, ctx->utf8_string_len, |
728 | 959k | ctx->raw_string, raw_str_used, ctx->converter); |
729 | 959k | if (retval != READSTAT_OK) |
730 | 15 | goto done; |
731 | 959k | value.v.string_value = ctx->utf8_string; |
732 | 959k | if (ctx->handle.value(ctx->current_row, ctx->variables[var_info->index], |
733 | 959k | value, ctx->user_ctx) != READSTAT_HANDLER_OK) { |
734 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
735 | 0 | goto done; |
736 | 0 | } |
737 | 959k | } |
738 | 959k | raw_str_used = 0; |
739 | 959k | segment_offset = 0; |
740 | 959k | var_index += var_info->n_segments; |
741 | 959k | } |
742 | 1.33M | } else if (var_info->type == READSTAT_TYPE_DOUBLE) { |
743 | 1.33M | if (!ctx->variables[var_info->index]->skip) { |
744 | 1.33M | memcpy(&fp_value, &buffer[data_offset], 8); |
745 | 1.33M | if (ctx->bswap) { |
746 | 1.00M | fp_value = byteswap_double(fp_value); |
747 | 1.00M | } |
748 | 1.33M | value.v.double_value = fp_value; |
749 | 1.33M | sav_tag_missing_double(&value, ctx); |
750 | 1.33M | if (ctx->handle.value(ctx->current_row, ctx->variables[var_info->index], |
751 | 1.33M | value, ctx->user_ctx) != READSTAT_HANDLER_OK) { |
752 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
753 | 0 | goto done; |
754 | 0 | } |
755 | 1.33M | } |
756 | 1.33M | var_index += var_info->n_segments; |
757 | 1.33M | col++; |
758 | 1.33M | } |
759 | 2.29M | data_offset += 8; |
760 | 2.29M | } |
761 | 2.26M | ctx->current_row++; |
762 | 2.26M | done: |
763 | 2.26M | return retval; |
764 | 2.26M | } |
765 | | |
766 | 1.52k | static readstat_error_t sav_read_data(sav_ctx_t *ctx) { |
767 | 1.52k | readstat_error_t retval = READSTAT_OK; |
768 | 1.52k | size_t longest_string = 256; |
769 | 1.52k | int i; |
770 | | |
771 | 167k | for (i=0; i<ctx->var_index;) { |
772 | 165k | spss_varinfo_t *info = ctx->varinfo[i]; |
773 | 165k | if (info->string_length > longest_string) { |
774 | 165 | longest_string = info->string_length; |
775 | 165 | } |
776 | 165k | i += info->n_segments; |
777 | 165k | } |
778 | | |
779 | 1.52k | ctx->raw_string_len = longest_string + sizeof(SAV_EIGHT_SPACES)-2; |
780 | 1.52k | ctx->raw_string = readstat_malloc(ctx->raw_string_len); |
781 | | |
782 | 1.52k | ctx->utf8_string_len = 4*longest_string+1 + sizeof(SAV_EIGHT_SPACES)-2; |
783 | 1.52k | ctx->utf8_string = readstat_malloc(ctx->utf8_string_len); |
784 | | |
785 | 1.52k | if (ctx->raw_string == NULL || ctx->utf8_string == NULL) { |
786 | 35 | retval = READSTAT_ERROR_MALLOC; |
787 | 35 | goto done; |
788 | 35 | } |
789 | | |
790 | 1.49k | if (ctx->compression == READSTAT_COMPRESS_ROWS) { |
791 | 416 | retval = sav_read_compressed_data(ctx, &sav_process_row); |
792 | 1.07k | } else if (ctx->compression == READSTAT_COMPRESS_BINARY) { |
793 | 505 | #if HAVE_ZLIB |
794 | 505 | retval = zsav_read_compressed_data(ctx, &sav_process_row); |
795 | | #else |
796 | | retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION; |
797 | | #endif |
798 | 569 | } else { |
799 | 569 | retval = sav_read_uncompressed_data(ctx, &sav_process_row); |
800 | 569 | } |
801 | 1.49k | if (retval != READSTAT_OK) |
802 | 471 | goto done; |
803 | | |
804 | 1.01k | if (ctx->record_count >= 0 && ctx->current_row != ctx->row_limit) { |
805 | 674 | retval = READSTAT_ERROR_ROW_COUNT_MISMATCH; |
806 | 674 | } |
807 | | |
808 | 1.52k | done: |
809 | 1.52k | return retval; |
810 | 1.01k | } |
811 | | |
812 | | static readstat_error_t sav_read_uncompressed_data(sav_ctx_t *ctx, |
813 | 569 | readstat_error_t (*row_handler)(unsigned char *, size_t, sav_ctx_t *)) { |
814 | 569 | readstat_error_t retval = READSTAT_OK; |
815 | 569 | readstat_io_t *io = ctx->io; |
816 | 569 | unsigned char *buffer = NULL; |
817 | 569 | size_t bytes_read = 0; |
818 | 569 | size_t buffer_len = ctx->var_offset * 8; |
819 | | |
820 | 569 | buffer = readstat_malloc(buffer_len); |
821 | | |
822 | 569 | if (ctx->row_offset) { |
823 | 0 | if (io->seek(buffer_len * ctx->row_offset, READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
824 | 0 | retval = READSTAT_ERROR_SEEK; |
825 | 0 | goto done; |
826 | 0 | } |
827 | 0 | ctx->row_offset = 0; |
828 | 0 | } |
829 | | |
830 | 1.23k | while (ctx->row_limit == -1 || ctx->current_row < ctx->row_limit) { |
831 | 1.02k | retval = sav_update_progress(ctx); |
832 | 1.02k | if (retval != READSTAT_OK) |
833 | 0 | goto done; |
834 | | |
835 | 1.02k | if ((bytes_read = io->read(buffer, buffer_len, io->io_ctx)) != buffer_len) |
836 | 346 | goto done; |
837 | | |
838 | 675 | retval = row_handler(buffer, buffer_len, ctx); |
839 | 675 | if (retval != READSTAT_OK) |
840 | 5 | goto done; |
841 | 675 | } |
842 | 569 | done: |
843 | 569 | if (buffer) |
844 | 569 | free(buffer); |
845 | | |
846 | 569 | return retval; |
847 | 569 | } |
848 | | |
849 | | static readstat_error_t sav_read_compressed_data(sav_ctx_t *ctx, |
850 | 416 | readstat_error_t (*row_handler)(unsigned char *, size_t, sav_ctx_t *)) { |
851 | 416 | readstat_error_t retval = READSTAT_OK; |
852 | 416 | readstat_io_t *io = ctx->io; |
853 | 416 | readstat_off_t data_offset = 0; |
854 | 416 | unsigned char buffer[DATA_BUFFER_SIZE]; |
855 | 416 | int buffer_used = 0; |
856 | | |
857 | 416 | size_t uncompressed_row_len = ctx->var_offset * 8; |
858 | 416 | readstat_off_t uncompressed_offset = 0; |
859 | 416 | unsigned char *uncompressed_row = NULL; |
860 | | |
861 | 416 | struct sav_row_stream_s state = { |
862 | 416 | .missing_value = ctx->missing_double, |
863 | 416 | .bias = ctx->bias, |
864 | 416 | .bswap = ctx->bswap }; |
865 | | |
866 | 416 | if (uncompressed_row_len && (uncompressed_row = readstat_malloc(uncompressed_row_len)) == NULL) { |
867 | 0 | retval = READSTAT_ERROR_MALLOC; |
868 | 0 | goto done; |
869 | 0 | } |
870 | | |
871 | 780 | while (1) { |
872 | 780 | retval = sav_update_progress(ctx); |
873 | 780 | if (retval != READSTAT_OK) |
874 | 0 | goto done; |
875 | | |
876 | 780 | buffer_used = io->read(buffer, sizeof(buffer), io->io_ctx); |
877 | 780 | if (buffer_used == -1 || buffer_used == 0 || (buffer_used % 8) != 0) |
878 | 370 | goto done; |
879 | | |
880 | 410 | state.status = SAV_ROW_STREAM_HAVE_DATA; |
881 | 410 | data_offset = 0; |
882 | | |
883 | 2.26M | while (state.status != SAV_ROW_STREAM_NEED_DATA) { |
884 | 2.26M | state.next_in = &buffer[data_offset]; |
885 | 2.26M | state.avail_in = buffer_used - data_offset; |
886 | | |
887 | 2.26M | state.next_out = &uncompressed_row[uncompressed_offset]; |
888 | 2.26M | state.avail_out = uncompressed_row_len - uncompressed_offset; |
889 | | |
890 | 2.26M | sav_decompress_row(&state); |
891 | | |
892 | 2.26M | uncompressed_offset = uncompressed_row_len - state.avail_out; |
893 | 2.26M | data_offset = buffer_used - state.avail_in; |
894 | | |
895 | 2.26M | if (state.status == SAV_ROW_STREAM_FINISHED_ROW) { |
896 | 2.26M | retval = row_handler(uncompressed_row, uncompressed_row_len, ctx); |
897 | 2.26M | if (retval != READSTAT_OK) |
898 | 10 | goto done; |
899 | | |
900 | 2.26M | uncompressed_offset = 0; |
901 | 2.26M | } |
902 | | |
903 | 2.26M | if (state.status == SAV_ROW_STREAM_FINISHED_ALL) |
904 | 26 | goto done; |
905 | 2.26M | if (ctx->row_limit > 0 && ctx->current_row == ctx->row_limit) |
906 | 10 | goto done; |
907 | 2.26M | } |
908 | 410 | } |
909 | | |
910 | 416 | done: |
911 | 416 | if (uncompressed_row) |
912 | 416 | free(uncompressed_row); |
913 | | |
914 | 416 | return retval; |
915 | 416 | } |
916 | | |
917 | 802 | static readstat_error_t sav_parse_machine_integer_info_record(const void *data, size_t data_len, sav_ctx_t *ctx) { |
918 | 802 | if (data_len != 32) |
919 | 10 | return READSTAT_ERROR_PARSE; |
920 | | |
921 | 792 | const char *src_charset = NULL; |
922 | 792 | const char *dst_charset = ctx->output_encoding; |
923 | 792 | sav_machine_integer_info_record_t record; |
924 | 792 | memcpy(&record, data, data_len); |
925 | 792 | if (ctx->bswap) { |
926 | 339 | record.character_code = byteswap4(record.character_code); |
927 | 339 | } |
928 | 792 | if (ctx->input_encoding) { |
929 | 676 | src_charset = ctx->input_encoding; |
930 | 676 | } else { |
931 | 116 | int i; |
932 | 6.93k | for (i=0; i<sizeof(_charset_table)/sizeof(_charset_table[0]); i++) { |
933 | 6.91k | if (record.character_code == _charset_table[i].code) { |
934 | 97 | src_charset = _charset_table[i].name; |
935 | 97 | break; |
936 | 97 | } |
937 | 6.91k | } |
938 | 116 | if (src_charset == NULL) { |
939 | 19 | if (ctx->handle.error) { |
940 | 0 | char error_buf[1024]; |
941 | 0 | snprintf(error_buf, sizeof(error_buf), "Unsupported character set: %d\n", record.character_code); |
942 | 0 | ctx->handle.error(error_buf, ctx->user_ctx); |
943 | 0 | } |
944 | 19 | return READSTAT_ERROR_UNSUPPORTED_CHARSET; |
945 | 19 | } |
946 | 97 | ctx->input_encoding = src_charset; |
947 | 97 | } |
948 | 773 | if (src_charset && dst_charset) { |
949 | | // You might be tempted to skip the charset conversion when src_charset |
950 | | // and dst_charset are the same. However, some versions of SPSS insert |
951 | | // illegally truncated strings (e.g. the last character is three bytes |
952 | | // but the field only has room for two bytes). So to prevent the client |
953 | | // from receiving an invalid byte sequence, we ram everything through |
954 | | // our iconv machinery. |
955 | 773 | iconv_t converter = iconv_open(dst_charset, src_charset); |
956 | 773 | if (converter == (iconv_t)-1) { |
957 | 1 | return READSTAT_ERROR_UNSUPPORTED_CHARSET; |
958 | 1 | } |
959 | 772 | if (ctx->converter) { |
960 | 676 | iconv_close(ctx->converter); |
961 | 676 | } |
962 | 772 | ctx->converter = converter; |
963 | 772 | } |
964 | 772 | return READSTAT_OK; |
965 | 773 | } |
966 | | |
967 | 457 | static readstat_error_t sav_parse_machine_floating_point_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx) { |
968 | 457 | if (size != 8 || count != 3) |
969 | 35 | return READSTAT_ERROR_PARSE; |
970 | | |
971 | 422 | sav_machine_floating_point_info_record_t fp_info; |
972 | 422 | memcpy(&fp_info, data, sizeof(sav_machine_floating_point_info_record_t)); |
973 | | |
974 | 422 | ctx->missing_double = ctx->bswap ? byteswap8(fp_info.sysmis) : fp_info.sysmis; |
975 | 422 | ctx->highest_double = ctx->bswap ? byteswap8(fp_info.highest) : fp_info.highest; |
976 | 422 | ctx->lowest_double = ctx->bswap ? byteswap8(fp_info.lowest) : fp_info.lowest; |
977 | | |
978 | 422 | return READSTAT_OK; |
979 | 457 | } |
980 | | |
981 | | /* We don't yet know how many real variables there are, so store the values in the record |
982 | | * and make sense of them later. */ |
983 | 524 | static readstat_error_t sav_store_variable_display_parameter_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx) { |
984 | 524 | if (size != 4) |
985 | 9 | return READSTAT_ERROR_PARSE; |
986 | | |
987 | 515 | const uint32_t *data_ptr = data; |
988 | 515 | int i; |
989 | | |
990 | 515 | ctx->variable_display_values = readstat_realloc(ctx->variable_display_values, count * sizeof(uint32_t)); |
991 | 515 | if (count > 0 && ctx->variable_display_values == NULL) |
992 | 0 | return READSTAT_ERROR_MALLOC; |
993 | | |
994 | 515 | ctx->variable_display_values_count = count; |
995 | 1.01M | for (i=0; i<count; i++) { |
996 | 1.01M | ctx->variable_display_values[i] = ctx->bswap ? byteswap4(data_ptr[i]) : data_ptr[i]; |
997 | 1.01M | } |
998 | 515 | return READSTAT_OK; |
999 | 515 | } |
1000 | | |
1001 | 1.56k | static readstat_error_t sav_parse_variable_display_parameter_record(sav_ctx_t *ctx) { |
1002 | 1.56k | if (!ctx->variable_display_values) |
1003 | 1.37k | return READSTAT_OK; |
1004 | | |
1005 | 187 | int i; |
1006 | 187 | long count = ctx->variable_display_values_count; |
1007 | 187 | if (count != 2 * ctx->var_index && count != 3 * ctx->var_index) { |
1008 | 36 | return READSTAT_ERROR_PARSE; |
1009 | 36 | } |
1010 | 151 | int has_display_width = ctx->var_index > 0 && (count / ctx->var_index == 3); |
1011 | 151 | int offset = 0; |
1012 | 5.20k | for (i=0; i<ctx->var_index;) { |
1013 | 5.05k | spss_varinfo_t *info = ctx->varinfo[i]; |
1014 | 5.05k | offset = (2 + has_display_width)*i; |
1015 | 5.05k | info->measure = spss_measure_to_readstat_measure(ctx->variable_display_values[offset++]); |
1016 | 5.05k | if (has_display_width) { |
1017 | 730 | info->display_width = ctx->variable_display_values[offset++]; |
1018 | 730 | } |
1019 | 5.05k | info->alignment = spss_alignment_to_readstat_alignment(ctx->variable_display_values[offset++]); |
1020 | | |
1021 | 5.05k | i += info->n_segments; |
1022 | 5.05k | } |
1023 | 151 | return READSTAT_OK; |
1024 | 187 | } |
1025 | | |
1026 | | static readstat_error_t sav_read_pascal_string(char *buf, size_t buf_len, |
1027 | 2.73k | const char **inout_data_ptr, size_t data_ptr_len, sav_ctx_t *ctx) { |
1028 | 2.73k | const char *data_ptr = *inout_data_ptr; |
1029 | 2.73k | const char *data_end = data_ptr + data_ptr_len; |
1030 | 2.73k | readstat_error_t retval = READSTAT_OK; |
1031 | 2.73k | uint32_t var_name_len = 0; |
1032 | | |
1033 | 2.73k | if (data_ptr + sizeof(uint32_t) > data_end) { |
1034 | 29 | retval = READSTAT_ERROR_PARSE; |
1035 | 29 | goto cleanup; |
1036 | 29 | } |
1037 | | |
1038 | 2.70k | memcpy(&var_name_len, data_ptr, sizeof(uint32_t)); |
1039 | 2.70k | if (ctx->bswap) |
1040 | 1.31k | var_name_len = byteswap4(var_name_len); |
1041 | | |
1042 | 2.70k | data_ptr += sizeof(uint32_t); |
1043 | | |
1044 | 2.70k | if (data_ptr + var_name_len > data_end) { |
1045 | 33 | retval = READSTAT_ERROR_PARSE; |
1046 | 33 | goto cleanup; |
1047 | 33 | } |
1048 | | |
1049 | 2.67k | retval = readstat_convert(buf, buf_len, data_ptr, var_name_len, NULL); |
1050 | 2.67k | if (retval != READSTAT_OK) |
1051 | 17 | goto cleanup; |
1052 | | |
1053 | 2.65k | data_ptr += var_name_len; |
1054 | | |
1055 | 2.73k | cleanup: |
1056 | 2.73k | *inout_data_ptr = data_ptr; |
1057 | | |
1058 | 2.73k | return retval; |
1059 | 2.65k | } |
1060 | | |
1061 | 719 | static readstat_error_t sav_parse_long_string_value_labels_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx) { |
1062 | 719 | if (!ctx->handle.value_label) |
1063 | 0 | return READSTAT_OK; |
1064 | 719 | if (size != 1) |
1065 | 14 | return READSTAT_ERROR_PARSE; |
1066 | | |
1067 | 705 | readstat_error_t retval = READSTAT_OK; |
1068 | 705 | uint32_t label_count = 0; |
1069 | 705 | uint32_t i = 0; |
1070 | 705 | const char *data_ptr = data; |
1071 | 705 | const char *data_end = data_ptr + count; |
1072 | 705 | char var_name_buf[256+1]; // unconverted |
1073 | 705 | char label_name_buf[256]; |
1074 | 705 | char *value_buffer = NULL; |
1075 | 705 | char *label_buffer = NULL; |
1076 | | |
1077 | 2.31k | while (data_ptr < data_end) { |
1078 | 1.85k | memset(label_name_buf, '\0', sizeof(label_name_buf)); |
1079 | | |
1080 | 1.85k | retval = sav_read_pascal_string(var_name_buf, sizeof(var_name_buf), |
1081 | 1.85k | &data_ptr, data_end - data_ptr, ctx); |
1082 | 1.85k | if (retval != READSTAT_OK) |
1083 | 42 | goto cleanup; |
1084 | | |
1085 | 19.5k | for (i=0; i<ctx->var_index;) { |
1086 | 19.4k | spss_varinfo_t *info = ctx->varinfo[i]; |
1087 | 19.4k | if (strcmp(var_name_buf, info->longname) == 0) { |
1088 | 1.71k | info->labels_index = ctx->value_labels_count++; |
1089 | 1.71k | snprintf(label_name_buf, sizeof(label_name_buf), |
1090 | 1.71k | SAV_LABEL_NAME_PREFIX "%d", info->labels_index); |
1091 | 1.71k | break; |
1092 | 1.71k | } |
1093 | 17.7k | i += info->n_segments; |
1094 | 17.7k | } |
1095 | | |
1096 | 1.81k | if (label_name_buf[0] == '\0') { |
1097 | 94 | retval = READSTAT_ERROR_PARSE; |
1098 | 94 | goto cleanup; |
1099 | 94 | } |
1100 | | |
1101 | 1.71k | data_ptr += sizeof(uint32_t); |
1102 | | |
1103 | 1.71k | if (data_ptr + sizeof(uint32_t) > data_end) { |
1104 | 13 | retval = READSTAT_ERROR_PARSE; |
1105 | 13 | goto cleanup; |
1106 | 13 | } |
1107 | | |
1108 | 1.70k | memcpy(&label_count, data_ptr, sizeof(uint32_t)); |
1109 | 1.70k | if (ctx->bswap) |
1110 | 869 | label_count = byteswap4(label_count); |
1111 | | |
1112 | 1.70k | data_ptr += sizeof(uint32_t); |
1113 | | |
1114 | 2.94k | for (i=0; i<label_count; i++) { |
1115 | 1.33k | uint32_t value_len = 0, label_len = 0; |
1116 | 1.33k | uint32_t value_buffer_len = 0, label_buffer_len = 0; |
1117 | | |
1118 | 1.33k | if (data_ptr + sizeof(uint32_t) > data_end) { |
1119 | 55 | retval = READSTAT_ERROR_PARSE; |
1120 | 55 | goto cleanup; |
1121 | 55 | } |
1122 | | |
1123 | 1.27k | memcpy(&value_len, data_ptr, sizeof(uint32_t)); |
1124 | 1.27k | if (ctx->bswap) |
1125 | 745 | value_len = byteswap4(value_len); |
1126 | | |
1127 | 1.27k | data_ptr += sizeof(uint32_t); |
1128 | | |
1129 | 1.27k | value_buffer_len = value_len*4+1; |
1130 | 1.27k | value_buffer = readstat_realloc(value_buffer, value_buffer_len); |
1131 | 1.27k | if (value_buffer == NULL) { |
1132 | 11 | retval = READSTAT_ERROR_MALLOC; |
1133 | 11 | goto cleanup; |
1134 | 11 | } |
1135 | | |
1136 | 1.26k | if (data_ptr + value_len > data_end) { |
1137 | 6 | retval = READSTAT_ERROR_PARSE; |
1138 | 6 | goto cleanup; |
1139 | 6 | } |
1140 | | |
1141 | 1.25k | retval = readstat_convert(value_buffer, value_buffer_len, data_ptr, value_len, ctx->converter); |
1142 | 1.25k | if (retval != READSTAT_OK) |
1143 | 1 | goto cleanup; |
1144 | | |
1145 | 1.25k | data_ptr += value_len; |
1146 | | |
1147 | 1.25k | if (data_ptr + sizeof(uint32_t) > data_end) { |
1148 | 7 | retval = READSTAT_ERROR_PARSE; |
1149 | 7 | goto cleanup; |
1150 | 7 | } |
1151 | | |
1152 | 1.25k | memcpy(&label_len, data_ptr, sizeof(uint32_t)); |
1153 | 1.25k | if (ctx->bswap) |
1154 | 738 | label_len = byteswap4(label_len); |
1155 | | |
1156 | 1.25k | data_ptr += sizeof(uint32_t); |
1157 | | |
1158 | 1.25k | label_buffer_len = label_len*4+1; |
1159 | 1.25k | label_buffer = readstat_realloc(label_buffer, label_buffer_len); |
1160 | 1.25k | if (label_buffer == NULL) { |
1161 | 4 | retval = READSTAT_ERROR_MALLOC; |
1162 | 4 | goto cleanup; |
1163 | 4 | } |
1164 | | |
1165 | 1.24k | if (data_ptr + label_len > data_end) { |
1166 | 7 | retval = READSTAT_ERROR_PARSE; |
1167 | 7 | goto cleanup; |
1168 | 7 | } |
1169 | | |
1170 | 1.24k | retval = readstat_convert(label_buffer, label_buffer_len, data_ptr, label_len, ctx->converter); |
1171 | 1.24k | if (retval != READSTAT_OK) |
1172 | 1 | goto cleanup; |
1173 | | |
1174 | 1.23k | data_ptr += label_len; |
1175 | | |
1176 | 1.23k | readstat_value_t value = { .type = READSTAT_TYPE_STRING }; |
1177 | 1.23k | value.v.string_value = value_buffer; |
1178 | | |
1179 | 1.23k | if (ctx->handle.value_label(label_name_buf, value, label_buffer, ctx->user_ctx) != READSTAT_HANDLER_OK) { |
1180 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
1181 | 0 | goto cleanup; |
1182 | 0 | } |
1183 | 1.23k | } |
1184 | 1.70k | } |
1185 | | |
1186 | 464 | if (data_ptr != data_end) { |
1187 | 0 | retval = READSTAT_ERROR_PARSE; |
1188 | 0 | } |
1189 | | |
1190 | 705 | cleanup: |
1191 | 705 | if (value_buffer) |
1192 | 280 | free(value_buffer); |
1193 | 705 | if (label_buffer) |
1194 | 279 | free(label_buffer); |
1195 | 705 | return retval; |
1196 | 464 | } |
1197 | | |
1198 | 470 | static readstat_error_t sav_parse_long_string_missing_values_record(const void *data, size_t size, size_t count, sav_ctx_t *ctx) { |
1199 | 470 | if (size != 1) |
1200 | 14 | return READSTAT_ERROR_PARSE; |
1201 | | |
1202 | 456 | readstat_error_t retval = READSTAT_OK; |
1203 | 456 | uint32_t i = 0, j = 0; |
1204 | 456 | const char *data_ptr = data; |
1205 | 456 | const char *data_end = data_ptr + count; |
1206 | 456 | char var_name_buf[256+1]; |
1207 | | |
1208 | 1.15k | while (data_ptr < data_end) { |
1209 | 879 | retval = sav_read_pascal_string(var_name_buf, sizeof(var_name_buf), |
1210 | 879 | &data_ptr, data_end - data_ptr, ctx); |
1211 | 879 | if (retval != READSTAT_OK) |
1212 | 37 | goto cleanup; |
1213 | | |
1214 | 842 | if (data_ptr == data_end) { |
1215 | 20 | retval = READSTAT_ERROR_PARSE; |
1216 | 20 | goto cleanup; |
1217 | 20 | } |
1218 | | |
1219 | 822 | char n_missing_values = *data_ptr++; |
1220 | 822 | if (n_missing_values < 1 || n_missing_values > 3) { |
1221 | 29 | retval = READSTAT_ERROR_PARSE; |
1222 | 29 | goto cleanup; |
1223 | 29 | } |
1224 | | |
1225 | 118k | for (i=0; i<ctx->var_index;) { |
1226 | 117k | spss_varinfo_t *info = ctx->varinfo[i]; |
1227 | 117k | if (strcmp(var_name_buf, info->longname) == 0) { |
1228 | 711 | info->n_missing_values = n_missing_values; |
1229 | | |
1230 | 711 | uint32_t var_name_len = 0; |
1231 | | |
1232 | 711 | if (data_ptr + sizeof(uint32_t) > data_end) { |
1233 | 7 | retval = READSTAT_ERROR_PARSE; |
1234 | 7 | goto cleanup; |
1235 | 7 | } |
1236 | | |
1237 | 704 | memcpy(&var_name_len, data_ptr, sizeof(uint32_t)); |
1238 | 704 | if (ctx->bswap) |
1239 | 335 | var_name_len = byteswap4(var_name_len); |
1240 | | |
1241 | 704 | data_ptr += sizeof(uint32_t); |
1242 | | |
1243 | 2.57k | for (j=0; j<n_missing_values; j++) { |
1244 | 1.87k | if (data_ptr + var_name_len > data_end) { |
1245 | 5 | retval = READSTAT_ERROR_PARSE; |
1246 | 5 | goto cleanup; |
1247 | 5 | } |
1248 | | |
1249 | 1.87k | retval = readstat_convert(info->missing_string_values[j], |
1250 | 1.87k | sizeof(info->missing_string_values[0]), |
1251 | 1.87k | data_ptr, var_name_len, ctx->converter); |
1252 | 1.87k | if (retval != READSTAT_OK) |
1253 | 5 | goto cleanup; |
1254 | | |
1255 | 1.86k | data_ptr += var_name_len; |
1256 | 1.86k | } |
1257 | 694 | break; |
1258 | 704 | } |
1259 | 117k | i += info->n_segments; |
1260 | 117k | } |
1261 | 776 | if (i == ctx->var_index) { |
1262 | 82 | retval = READSTAT_ERROR_PARSE; |
1263 | 82 | goto cleanup; |
1264 | 82 | } |
1265 | 776 | } |
1266 | | |
1267 | 271 | if (data_ptr != data_end) { |
1268 | 0 | retval = READSTAT_ERROR_PARSE; |
1269 | 0 | } |
1270 | | |
1271 | 456 | cleanup: |
1272 | 456 | return retval; |
1273 | 271 | } |
1274 | | |
1275 | 5.26k | static readstat_error_t sav_parse_records_pass1(sav_ctx_t *ctx) { |
1276 | 5.26k | char data_buf[4096]; |
1277 | 5.26k | readstat_error_t retval = READSTAT_OK; |
1278 | 5.26k | readstat_io_t *io = ctx->io; |
1279 | 325k | while (1) { |
1280 | 325k | uint32_t rec_type; |
1281 | 325k | uint32_t extra_info[3]; |
1282 | 325k | size_t data_len = 0; |
1283 | 325k | int i; |
1284 | 325k | int done = 0; |
1285 | 325k | if (io->read(&rec_type, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
1286 | 294 | retval = READSTAT_ERROR_READ; |
1287 | 294 | goto cleanup; |
1288 | 294 | } |
1289 | | |
1290 | 325k | if (ctx->bswap) { |
1291 | 38.9k | rec_type = byteswap4(rec_type); |
1292 | 38.9k | } |
1293 | | |
1294 | 325k | switch (rec_type) { |
1295 | 281k | case SAV_RECORD_TYPE_VARIABLE: |
1296 | 281k | retval = sav_skip_variable_record(ctx); |
1297 | 281k | if (retval != READSTAT_OK) |
1298 | 145 | goto cleanup; |
1299 | 281k | break; |
1300 | 281k | case SAV_RECORD_TYPE_VALUE_LABEL: |
1301 | 28.5k | retval = sav_skip_value_label_record(ctx); |
1302 | 28.5k | if (retval != READSTAT_OK) |
1303 | 187 | goto cleanup; |
1304 | 28.4k | break; |
1305 | 28.4k | case SAV_RECORD_TYPE_DOCUMENT: |
1306 | 1.92k | retval = sav_skip_document_record(ctx); |
1307 | 1.92k | if (retval != READSTAT_OK) |
1308 | 57 | goto cleanup; |
1309 | 1.86k | break; |
1310 | 4.27k | case SAV_RECORD_TYPE_DICT_TERMINATION: |
1311 | 4.27k | done = 1; |
1312 | 4.27k | break; |
1313 | 8.47k | case SAV_RECORD_TYPE_HAS_DATA: |
1314 | 8.47k | if (io->read(extra_info, sizeof(extra_info), io->io_ctx) < sizeof(extra_info)) { |
1315 | 8 | retval = READSTAT_ERROR_READ; |
1316 | 8 | goto cleanup; |
1317 | 8 | } |
1318 | 8.46k | if (ctx->bswap) { |
1319 | 8.97k | for (i=0; i<3; i++) |
1320 | 6.72k | extra_info[i] = byteswap4(extra_info[i]); |
1321 | 2.24k | } |
1322 | 8.46k | uint32_t subtype = extra_info[0]; |
1323 | 8.46k | size_t size = extra_info[1]; |
1324 | 8.46k | size_t count = extra_info[2]; |
1325 | 8.46k | data_len = size * count; |
1326 | 8.46k | if (subtype == SAV_RECORD_SUBTYPE_INTEGER_INFO) { |
1327 | 882 | if (data_len > sizeof(data_buf)) { |
1328 | 63 | retval = READSTAT_ERROR_PARSE; |
1329 | 63 | goto cleanup; |
1330 | 63 | } |
1331 | 819 | if (io->read(data_buf, data_len, io->io_ctx) < data_len) { |
1332 | 17 | retval = READSTAT_ERROR_PARSE; |
1333 | 17 | goto cleanup; |
1334 | 17 | } |
1335 | 802 | retval = sav_parse_machine_integer_info_record(data_buf, data_len, ctx); |
1336 | 802 | if (retval != READSTAT_OK) |
1337 | 30 | goto cleanup; |
1338 | 7.58k | } else { |
1339 | 7.58k | if (io->seek(data_len, READSTAT_SEEK_CUR, io->io_ctx) == -1) { |
1340 | 115 | retval = READSTAT_ERROR_SEEK; |
1341 | 115 | goto cleanup; |
1342 | 115 | } |
1343 | 7.58k | } |
1344 | 8.24k | break; |
1345 | 8.24k | default: |
1346 | 77 | retval = READSTAT_ERROR_PARSE; |
1347 | 77 | goto cleanup; |
1348 | 0 | break; |
1349 | 325k | } |
1350 | 324k | if (done) |
1351 | 4.27k | break; |
1352 | 324k | } |
1353 | 5.26k | cleanup: |
1354 | 5.26k | return retval; |
1355 | 5.26k | } |
1356 | | |
1357 | 4.27k | static readstat_error_t sav_parse_records_pass2(sav_ctx_t *ctx) { |
1358 | 4.27k | void *data_buf = NULL; |
1359 | 4.27k | size_t data_buf_capacity = 4096; |
1360 | 4.27k | readstat_error_t retval = READSTAT_OK; |
1361 | 4.27k | readstat_io_t *io = ctx->io; |
1362 | | |
1363 | 4.27k | if ((data_buf = readstat_malloc(data_buf_capacity)) == NULL) { |
1364 | 0 | retval = READSTAT_ERROR_MALLOC; |
1365 | 0 | goto cleanup; |
1366 | 0 | } |
1367 | | |
1368 | 331k | while (1) { |
1369 | 331k | uint32_t rec_type; |
1370 | 331k | uint32_t extra_info[3]; |
1371 | 331k | size_t data_len = 0; |
1372 | 331k | int i; |
1373 | 331k | int done = 0; |
1374 | 331k | if (io->read(&rec_type, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) { |
1375 | 33 | retval = READSTAT_ERROR_READ; |
1376 | 33 | goto cleanup; |
1377 | 33 | } |
1378 | | |
1379 | 331k | if (ctx->bswap) { |
1380 | 23.7k | rec_type = byteswap4(rec_type); |
1381 | 23.7k | } |
1382 | | |
1383 | 331k | switch (rec_type) { |
1384 | 302k | case SAV_RECORD_TYPE_VARIABLE: |
1385 | 302k | if ((retval = sav_read_variable_record(ctx)) != READSTAT_OK) |
1386 | 172 | goto cleanup; |
1387 | 302k | break; |
1388 | 302k | case SAV_RECORD_TYPE_VALUE_LABEL: |
1389 | 18.4k | if ((retval = sav_read_value_label_record(ctx)) != READSTAT_OK) |
1390 | 179 | goto cleanup; |
1391 | 18.2k | break; |
1392 | 18.2k | case SAV_RECORD_TYPE_DOCUMENT: |
1393 | 1.06k | if ((retval = sav_read_document_record(ctx)) != READSTAT_OK) |
1394 | 66 | goto cleanup; |
1395 | 996 | break; |
1396 | 2.76k | case SAV_RECORD_TYPE_DICT_TERMINATION: |
1397 | 2.76k | if ((retval = sav_read_dictionary_termination_record(ctx)) != READSTAT_OK) |
1398 | 1.16k | goto cleanup; |
1399 | 1.60k | done = 1; |
1400 | 1.60k | break; |
1401 | 6.74k | case SAV_RECORD_TYPE_HAS_DATA: |
1402 | 6.74k | if (io->read(extra_info, sizeof(extra_info), io->io_ctx) < sizeof(extra_info)) { |
1403 | 7 | retval = READSTAT_ERROR_READ; |
1404 | 7 | goto cleanup; |
1405 | 7 | } |
1406 | 6.74k | if (ctx->bswap) { |
1407 | 6.89k | for (i=0; i<3; i++) |
1408 | 5.16k | extra_info[i] = byteswap4(extra_info[i]); |
1409 | 1.72k | } |
1410 | 6.74k | uint32_t subtype = extra_info[0]; |
1411 | 6.74k | size_t size = extra_info[1]; |
1412 | 6.74k | size_t count = extra_info[2]; |
1413 | 6.74k | data_len = size * count; |
1414 | 6.74k | if (data_buf_capacity < data_len) { |
1415 | 171 | if ((data_buf = readstat_realloc(data_buf, data_buf_capacity = data_len)) == NULL) { |
1416 | 65 | retval = READSTAT_ERROR_MALLOC; |
1417 | 65 | goto cleanup; |
1418 | 65 | } |
1419 | 171 | } |
1420 | 6.67k | if (data_len == 0 || io->read(data_buf, data_len, io->io_ctx) < data_len) { |
1421 | 58 | retval = READSTAT_ERROR_PARSE; |
1422 | 58 | goto cleanup; |
1423 | 58 | } |
1424 | | |
1425 | 6.61k | switch (subtype) { |
1426 | 67 | case SAV_RECORD_SUBTYPE_INTEGER_INFO: |
1427 | | /* parsed in pass 1 */ |
1428 | 67 | break; |
1429 | 457 | case SAV_RECORD_SUBTYPE_FP_INFO: |
1430 | 457 | retval = sav_parse_machine_floating_point_record(data_buf, size, count, ctx); |
1431 | 457 | if (retval != READSTAT_OK) |
1432 | 35 | goto cleanup; |
1433 | 422 | break; |
1434 | 524 | case SAV_RECORD_SUBTYPE_VAR_DISPLAY: |
1435 | 524 | retval = sav_store_variable_display_parameter_record(data_buf, size, count, ctx); |
1436 | 524 | if (retval != READSTAT_OK) |
1437 | 9 | goto cleanup; |
1438 | 515 | break; |
1439 | 1.11k | case SAV_RECORD_SUBTYPE_LONG_VAR_NAME: |
1440 | 1.11k | retval = sav_parse_long_variable_names_record(data_buf, count, ctx); |
1441 | 1.11k | if (retval != READSTAT_OK) |
1442 | 175 | goto cleanup; |
1443 | 937 | break; |
1444 | 2.22k | case SAV_RECORD_SUBTYPE_VERY_LONG_STR: |
1445 | 2.22k | retval = sav_parse_very_long_string_record(data_buf, count, ctx); |
1446 | 2.22k | if (retval != READSTAT_OK) |
1447 | 178 | goto cleanup; |
1448 | 2.04k | break; |
1449 | 2.04k | case SAV_RECORD_SUBTYPE_LONG_STRING_VALUE_LABELS: |
1450 | 719 | retval = sav_parse_long_string_value_labels_record(data_buf, size, count, ctx); |
1451 | 719 | if (retval != READSTAT_OK) |
1452 | 255 | goto cleanup; |
1453 | 464 | break; |
1454 | 470 | case SAV_RECORD_SUBTYPE_LONG_STRING_MISSING_VALUES: |
1455 | 470 | retval = sav_parse_long_string_missing_values_record(data_buf, size, count, ctx); |
1456 | 470 | if (retval != READSTAT_OK) |
1457 | 199 | goto cleanup; |
1458 | 271 | break; |
1459 | 1.04k | default: /* misc. info */ |
1460 | 1.04k | break; |
1461 | 6.61k | } |
1462 | 5.76k | break; |
1463 | 5.76k | default: |
1464 | 71 | retval = READSTAT_ERROR_PARSE; |
1465 | 71 | goto cleanup; |
1466 | 0 | break; |
1467 | 331k | } |
1468 | 328k | if (done) |
1469 | 1.60k | break; |
1470 | 328k | } |
1471 | 4.27k | cleanup: |
1472 | 4.27k | if (data_buf) |
1473 | 4.20k | free(data_buf); |
1474 | 4.27k | return retval; |
1475 | 4.27k | } |
1476 | | |
1477 | 1.60k | static readstat_error_t sav_set_n_segments_and_var_count(sav_ctx_t *ctx) { |
1478 | 1.60k | int i; |
1479 | 1.60k | ctx->var_count = 0; |
1480 | 167k | for (i=0; i<ctx->var_index;) { |
1481 | 166k | spss_varinfo_t *info = ctx->varinfo[i]; |
1482 | 166k | if (info->string_length > VERY_LONG_STRING_MAX_LENGTH) |
1483 | 32 | return READSTAT_ERROR_PARSE; |
1484 | 166k | if (info->string_length) { |
1485 | 413 | info->n_segments = (info->string_length + 251) / 252; |
1486 | 413 | } |
1487 | 166k | info->index = ctx->var_count++; |
1488 | 166k | i += info->n_segments; |
1489 | 166k | } |
1490 | 1.57k | ctx->variables = readstat_calloc(ctx->var_count, sizeof(readstat_variable_t *)); |
1491 | 1.57k | return READSTAT_OK; |
1492 | 1.60k | } |
1493 | | |
1494 | 1.52k | static readstat_error_t sav_handle_variables(sav_ctx_t *ctx) { |
1495 | 1.52k | int i; |
1496 | 1.52k | int index_after_skipping = 0; |
1497 | 1.52k | readstat_error_t retval = READSTAT_OK; |
1498 | | |
1499 | 1.52k | if (!ctx->handle.variable) |
1500 | 0 | return retval; |
1501 | | |
1502 | 167k | for (i=0; i<ctx->var_index;) { |
1503 | 165k | char label_name_buf[256]; |
1504 | 165k | spss_varinfo_t *info = ctx->varinfo[i]; |
1505 | 165k | ctx->variables[info->index] = spss_init_variable_for_info(info, index_after_skipping, ctx->converter); |
1506 | | |
1507 | 165k | snprintf(label_name_buf, sizeof(label_name_buf), SAV_LABEL_NAME_PREFIX "%d", info->labels_index); |
1508 | | |
1509 | 165k | int cb_retval = ctx->handle.variable(info->index, ctx->variables[info->index], |
1510 | 165k | info->labels_index == -1 ? NULL : label_name_buf, |
1511 | 165k | ctx->user_ctx); |
1512 | | |
1513 | 165k | if (cb_retval == READSTAT_HANDLER_ABORT) { |
1514 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
1515 | 0 | goto cleanup; |
1516 | 0 | } |
1517 | | |
1518 | 165k | if (cb_retval == READSTAT_HANDLER_SKIP_VARIABLE) { |
1519 | 0 | ctx->variables[info->index]->skip = 1; |
1520 | 165k | } else { |
1521 | 165k | index_after_skipping++; |
1522 | 165k | } |
1523 | | |
1524 | 165k | i += info->n_segments; |
1525 | 165k | } |
1526 | 1.52k | cleanup: |
1527 | 1.52k | return retval; |
1528 | 1.52k | } |
1529 | | |
1530 | 1.52k | static readstat_error_t sav_handle_fweight(sav_ctx_t *ctx) { |
1531 | 1.52k | readstat_error_t retval = READSTAT_OK; |
1532 | 1.52k | int i; |
1533 | 1.52k | if (ctx->handle.fweight && ctx->fweight_index >= 0) { |
1534 | 109k | for (i=0; i<ctx->var_index;) { |
1535 | 108k | spss_varinfo_t *info = ctx->varinfo[i]; |
1536 | 108k | if (info->offset == ctx->fweight_index - 1) { |
1537 | 18 | if (ctx->handle.fweight(ctx->variables[info->index], ctx->user_ctx) != READSTAT_HANDLER_OK) { |
1538 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
1539 | 0 | goto cleanup; |
1540 | 0 | } |
1541 | 18 | break; |
1542 | 18 | } |
1543 | 108k | i += info->n_segments; |
1544 | 108k | } |
1545 | 1.08k | } |
1546 | 1.52k | cleanup: |
1547 | 1.52k | return retval; |
1548 | 1.52k | } |
1549 | | |
1550 | 5.26k | readstat_error_t sav_parse_timestamp(sav_ctx_t *ctx, sav_file_header_record_t *header) { |
1551 | 5.26k | readstat_error_t retval = READSTAT_OK; |
1552 | 5.26k | struct tm timestamp = { .tm_isdst = -1 }; |
1553 | | |
1554 | 5.26k | if ((retval = sav_parse_time(header->creation_time, sizeof(header->creation_time), |
1555 | 5.26k | ×tamp, ctx->handle.error, ctx->user_ctx)) |
1556 | 5.26k | != READSTAT_OK) |
1557 | 3.57k | goto cleanup; |
1558 | | |
1559 | 1.69k | if ((retval = sav_parse_date(header->creation_date, sizeof(header->creation_date), |
1560 | 1.69k | ×tamp, ctx->handle.error, ctx->user_ctx)) |
1561 | 1.69k | != READSTAT_OK) |
1562 | 429 | goto cleanup; |
1563 | | |
1564 | 1.26k | ctx->timestamp = mktime(×tamp); |
1565 | | |
1566 | 5.26k | cleanup: |
1567 | 5.26k | return retval; |
1568 | 1.26k | } |
1569 | | |
1570 | 5.31k | readstat_error_t readstat_parse_sav(readstat_parser_t *parser, const char *path, void *user_ctx) { |
1571 | 5.31k | readstat_error_t retval = READSTAT_OK; |
1572 | 5.31k | readstat_io_t *io = parser->io; |
1573 | 5.31k | sav_file_header_record_t header; |
1574 | 5.31k | sav_ctx_t *ctx = NULL; |
1575 | 5.31k | size_t file_size = 0; |
1576 | | |
1577 | 5.31k | if (io->open(path, io->io_ctx) == -1) { |
1578 | 0 | return READSTAT_ERROR_OPEN; |
1579 | 0 | } |
1580 | | |
1581 | 5.31k | file_size = io->seek(0, READSTAT_SEEK_END, io->io_ctx); |
1582 | 5.31k | if (file_size == -1) { |
1583 | 0 | retval = READSTAT_ERROR_SEEK; |
1584 | 0 | goto cleanup; |
1585 | 0 | } |
1586 | | |
1587 | 5.31k | if (io->seek(0, READSTAT_SEEK_SET, io->io_ctx) == -1) { |
1588 | 0 | retval = READSTAT_ERROR_SEEK; |
1589 | 0 | goto cleanup; |
1590 | 0 | } |
1591 | | |
1592 | 5.31k | if (io->read(&header, sizeof(sav_file_header_record_t), io->io_ctx) < sizeof(sav_file_header_record_t)) { |
1593 | 18 | retval = READSTAT_ERROR_READ; |
1594 | 18 | goto cleanup; |
1595 | 18 | } |
1596 | | |
1597 | 5.29k | ctx = sav_ctx_init(&header, io); |
1598 | 5.29k | if (ctx == NULL) { |
1599 | 31 | retval = READSTAT_ERROR_PARSE; |
1600 | 31 | goto cleanup; |
1601 | 31 | } |
1602 | | |
1603 | 5.26k | ctx->handle = parser->handlers; |
1604 | 5.26k | ctx->input_encoding = parser->input_encoding; |
1605 | 5.26k | ctx->output_encoding = parser->output_encoding; |
1606 | 5.26k | ctx->user_ctx = user_ctx; |
1607 | 5.26k | ctx->file_size = file_size; |
1608 | 5.26k | if (parser->row_offset > 0) |
1609 | 0 | ctx->row_offset = parser->row_offset; |
1610 | 5.26k | if (ctx->record_count >= 0) { |
1611 | 3.83k | int record_count_after_skipping = ctx->record_count - ctx->row_offset; |
1612 | 3.83k | if (record_count_after_skipping < 0) { |
1613 | 0 | record_count_after_skipping = 0; |
1614 | 0 | ctx->row_offset = ctx->record_count; |
1615 | 0 | } |
1616 | 3.83k | ctx->row_limit = record_count_after_skipping; |
1617 | 3.83k | if (parser->row_limit > 0 && parser->row_limit < record_count_after_skipping) |
1618 | 0 | ctx->row_limit = parser->row_limit; |
1619 | 3.83k | } else if (parser->row_limit > 0) { |
1620 | 0 | ctx->row_limit = parser->row_limit; |
1621 | 0 | } |
1622 | | |
1623 | | /* ignore errors */ |
1624 | 5.26k | sav_parse_timestamp(ctx, &header); |
1625 | | |
1626 | 5.26k | if ((retval = sav_parse_records_pass1(ctx)) != READSTAT_OK) |
1627 | 993 | goto cleanup; |
1628 | | |
1629 | 4.27k | if (io->seek(sizeof(sav_file_header_record_t), READSTAT_SEEK_SET, io->io_ctx) == -1) { |
1630 | 0 | retval = READSTAT_ERROR_SEEK; |
1631 | 0 | goto cleanup; |
1632 | 0 | } |
1633 | | |
1634 | 4.27k | if ((retval = sav_update_progress(ctx)) != READSTAT_OK) |
1635 | 0 | goto cleanup; |
1636 | | |
1637 | 4.27k | if ((retval = sav_parse_records_pass2(ctx)) != READSTAT_OK) |
1638 | 2.66k | goto cleanup; |
1639 | | |
1640 | 1.60k | if ((retval = sav_set_n_segments_and_var_count(ctx)) != READSTAT_OK) |
1641 | 32 | goto cleanup; |
1642 | | |
1643 | 1.57k | if (ctx->var_count == 0) { |
1644 | 9 | retval = READSTAT_ERROR_PARSE; |
1645 | 9 | goto cleanup; |
1646 | 9 | } |
1647 | | |
1648 | 1.56k | if (ctx->handle.metadata) { |
1649 | 1.56k | readstat_metadata_t metadata = { |
1650 | 1.56k | .row_count = ctx->record_count < 0 ? -1 : ctx->row_limit, |
1651 | 1.56k | .var_count = ctx->var_count, |
1652 | 1.56k | .file_encoding = ctx->input_encoding, |
1653 | 1.56k | .file_format_version = ctx->format_version, |
1654 | 1.56k | .creation_time = ctx->timestamp, |
1655 | 1.56k | .modified_time = ctx->timestamp, |
1656 | 1.56k | .compression = ctx->compression, |
1657 | 1.56k | .endianness = ctx->endianness |
1658 | 1.56k | }; |
1659 | 1.56k | if ((retval = readstat_convert(ctx->file_label, sizeof(ctx->file_label), |
1660 | 1.56k | header.file_label, sizeof(header.file_label), ctx->converter)) != READSTAT_OK) |
1661 | 1 | goto cleanup; |
1662 | | |
1663 | 1.56k | metadata.file_label = ctx->file_label; |
1664 | | |
1665 | 1.56k | if (ctx->handle.metadata(&metadata, ctx->user_ctx) != READSTAT_HANDLER_OK) { |
1666 | 0 | retval = READSTAT_ERROR_USER_ABORT; |
1667 | 0 | goto cleanup; |
1668 | 0 | } |
1669 | 1.56k | } |
1670 | | |
1671 | 1.56k | if ((retval = sav_parse_variable_display_parameter_record(ctx)) != READSTAT_OK) |
1672 | 36 | goto cleanup; |
1673 | | |
1674 | 1.52k | if ((retval = sav_handle_variables(ctx)) != READSTAT_OK) |
1675 | 0 | goto cleanup; |
1676 | | |
1677 | 1.52k | if ((retval = sav_handle_fweight(ctx)) != READSTAT_OK) |
1678 | 0 | goto cleanup; |
1679 | | |
1680 | 1.52k | if (ctx->handle.value) { |
1681 | 1.52k | retval = sav_read_data(ctx); |
1682 | 1.52k | } |
1683 | | |
1684 | 5.31k | cleanup: |
1685 | 5.31k | io->close(io->io_ctx); |
1686 | 5.31k | if (ctx) |
1687 | 5.26k | sav_ctx_free(ctx); |
1688 | | |
1689 | 5.31k | return retval; |
1690 | 1.52k | } |