/src/readstat/src/stata/readstat_dta.c
Line | Count | Source |
1 | | #include <stdlib.h> |
2 | | #include <math.h> |
3 | | #include <stdint.h> |
4 | | #include <string.h> |
5 | | #include <sys/types.h> |
6 | | |
7 | | #include "../readstat.h" |
8 | | #include "../readstat_iconv.h" |
9 | | #include "../readstat_malloc.h" |
10 | | #include "../readstat_bits.h" |
11 | | |
12 | | #include "readstat_dta.h" |
13 | | |
/* Inclusive range of DTA format versions accepted by dta_ctx_init();
 * anything outside it is rejected with
 * READSTAT_ERROR_UNSUPPORTED_FILE_FORMAT_VERSION. */
#define DTA_MIN_VERSION 104
#define DTA_MAX_VERSION 119
16 | | |
17 | 4.25k | dta_ctx_t *dta_ctx_alloc(readstat_io_t *io) { |
18 | 4.25k | dta_ctx_t *ctx = calloc(1, sizeof(dta_ctx_t)); |
19 | 4.25k | if (ctx == NULL) { |
20 | 0 | return NULL; |
21 | 0 | } |
22 | | |
23 | 4.25k | ctx->io = io; |
24 | 4.25k | ctx->initialized = 0; |
25 | | |
26 | 4.25k | return ctx; |
27 | 4.25k | } |
28 | | |
29 | | readstat_error_t dta_ctx_init(dta_ctx_t *ctx, uint32_t nvar, uint64_t nobs, |
30 | | unsigned char byteorder, unsigned char ds_format, |
31 | 3.79k | const char *input_encoding, const char *output_encoding) { |
32 | 3.79k | readstat_error_t retval = READSTAT_OK; |
33 | 3.79k | int machine_byteorder = DTA_HILO; |
34 | 3.79k | if (ds_format < DTA_MIN_VERSION || ds_format > DTA_MAX_VERSION) |
35 | 14 | return READSTAT_ERROR_UNSUPPORTED_FILE_FORMAT_VERSION; |
36 | | |
37 | 3.77k | if (machine_is_little_endian()) { |
38 | 3.77k | machine_byteorder = DTA_LOHI; |
39 | 3.77k | } |
40 | | |
41 | 3.77k | ctx->bswap = (byteorder != machine_byteorder); |
42 | 3.77k | ctx->ds_format = ds_format; |
43 | 3.77k | ctx->endianness = byteorder == DTA_LOHI ? READSTAT_ENDIAN_LITTLE : READSTAT_ENDIAN_BIG; |
44 | | |
45 | 3.77k | ctx->nvar = nvar; |
46 | 3.77k | ctx->nobs = nobs; |
47 | | |
48 | 3.77k | if (ctx->nvar) { |
49 | 2.87k | if ((ctx->variables = readstat_calloc(ctx->nvar, sizeof(readstat_variable_t *))) == NULL) { |
50 | 87 | retval = READSTAT_ERROR_MALLOC; |
51 | 87 | goto cleanup; |
52 | 87 | } |
53 | 2.87k | } |
54 | | |
55 | 3.69k | ctx->machine_is_twos_complement = READSTAT_MACHINE_IS_TWOS_COMPLEMENT; |
56 | | |
57 | 3.69k | if (ds_format < 105) { |
58 | 688 | ctx->fmtlist_entry_len = 7; |
59 | 3.00k | } else if (ds_format < 114) { |
60 | 865 | ctx->fmtlist_entry_len = 12; |
61 | 2.13k | } else if (ds_format < 118) { |
62 | 1.74k | ctx->fmtlist_entry_len = 49; |
63 | 1.74k | } else { |
64 | 390 | ctx->fmtlist_entry_len = 57; |
65 | 390 | } |
66 | | |
67 | 3.69k | if (ds_format >= 117) { |
68 | 2.09k | ctx->typlist_version = 117; |
69 | 2.09k | } else if (ds_format >= 111) { |
70 | 320 | ctx->typlist_version = 111; |
71 | 1.28k | } else { |
72 | 1.28k | ctx->typlist_version = 0; |
73 | 1.28k | } |
74 | | |
75 | 3.69k | if (ds_format >= 118) { |
76 | 390 | ctx->data_label_len_len = 2; |
77 | 390 | ctx->strl_v_len = 2; |
78 | 390 | ctx->strl_o_len = 6; |
79 | 3.30k | } else if (ds_format >= 117) { |
80 | 1.70k | ctx->data_label_len_len = 1; |
81 | 1.70k | ctx->strl_v_len = 4; |
82 | 1.70k | ctx->strl_o_len = 4; |
83 | 1.70k | } |
84 | | |
85 | 3.69k | if (ds_format < 105) { |
86 | 688 | ctx->expansion_len_len = 0; |
87 | 3.00k | } else if (ds_format < 110) { |
88 | 592 | ctx->expansion_len_len = 2; |
89 | 2.41k | } else { |
90 | 2.41k | ctx->expansion_len_len = 4; |
91 | 2.41k | } |
92 | | |
93 | 3.69k | if (ds_format < 110) { |
94 | 1.28k | ctx->lbllist_entry_len = 9; |
95 | 1.28k | ctx->variable_name_len = 9; |
96 | 1.28k | ctx->ch_metadata_len = 9; |
97 | 2.41k | } else if (ds_format < 118) { |
98 | 2.02k | ctx->lbllist_entry_len = 33; |
99 | 2.02k | ctx->variable_name_len = 33; |
100 | 2.02k | ctx->ch_metadata_len = 33; |
101 | 2.02k | } else { |
102 | 390 | ctx->lbllist_entry_len = 129; |
103 | 390 | ctx->variable_name_len = 129; |
104 | 390 | ctx->ch_metadata_len = 129; |
105 | 390 | } |
106 | | |
107 | 3.69k | if (ds_format < 108) { |
108 | 1.27k | ctx->variable_labels_entry_len = 32; |
109 | 1.27k | ctx->data_label_len = 32; |
110 | 2.41k | } else if (ds_format < 118) { |
111 | 2.02k | ctx->variable_labels_entry_len = 81; |
112 | 2.02k | ctx->data_label_len = 81; |
113 | 2.02k | } else { |
114 | 390 | ctx->variable_labels_entry_len = 321; |
115 | 390 | ctx->data_label_len = 321; |
116 | 390 | } |
117 | | |
118 | 3.69k | if (ds_format < 105) { |
119 | 688 | ctx->timestamp_len = 0; |
120 | 688 | ctx->value_label_table_len_len = 2; |
121 | 688 | ctx->value_label_table_labname_len = 12; |
122 | 688 | ctx->value_label_table_padding_len = 2; |
123 | 3.00k | } else { |
124 | 3.00k | ctx->timestamp_len = 18; |
125 | 3.00k | ctx->value_label_table_len_len = 4; |
126 | 3.00k | if (ds_format < 118) { |
127 | 2.61k | ctx->value_label_table_labname_len = 33; |
128 | 2.61k | } else { |
129 | 390 | ctx->value_label_table_labname_len = 129; |
130 | 390 | } |
131 | 3.00k | ctx->value_label_table_padding_len = 3; |
132 | 3.00k | } |
133 | | |
134 | 3.69k | if (ds_format < 117) { |
135 | 1.60k | ctx->typlist_entry_len = 1; |
136 | 1.60k | ctx->file_is_xmlish = 0; |
137 | 2.09k | } else { |
138 | 2.09k | ctx->typlist_entry_len = 2; |
139 | 2.09k | ctx->file_is_xmlish = 1; |
140 | 2.09k | } |
141 | | |
142 | 3.69k | if (ds_format < 113) { |
143 | 1.30k | ctx->max_int8 = DTA_OLD_MAX_INT8; |
144 | 1.30k | ctx->max_int16 = DTA_OLD_MAX_INT16; |
145 | 1.30k | ctx->max_int32 = DTA_OLD_MAX_INT32; |
146 | 1.30k | ctx->max_float = DTA_OLD_MAX_FLOAT; |
147 | 1.30k | ctx->max_double = DTA_OLD_MAX_DOUBLE; |
148 | 2.39k | } else { |
149 | 2.39k | ctx->max_int8 = DTA_113_MAX_INT8; |
150 | 2.39k | ctx->max_int16 = DTA_113_MAX_INT16; |
151 | 2.39k | ctx->max_int32 = DTA_113_MAX_INT32; |
152 | 2.39k | ctx->max_float = DTA_113_MAX_FLOAT; |
153 | 2.39k | ctx->max_double = DTA_113_MAX_DOUBLE; |
154 | | |
155 | 2.39k | ctx->supports_tagged_missing = 1; |
156 | 2.39k | } |
157 | | |
158 | 3.69k | if (output_encoding) { |
159 | 3.69k | if (input_encoding) { |
160 | 0 | ctx->converter = iconv_open(output_encoding, input_encoding); |
161 | 3.69k | } else if (ds_format < 118) { |
162 | 3.30k | ctx->converter = iconv_open(output_encoding, "WINDOWS-1252"); |
163 | 3.30k | } else if (strcmp(output_encoding, "UTF-8") != 0) { |
164 | 0 | ctx->converter = iconv_open(output_encoding, "UTF-8"); |
165 | 0 | } |
166 | 3.69k | if (ctx->converter == (iconv_t)-1) { |
167 | 0 | ctx->converter = NULL; |
168 | 0 | retval = READSTAT_ERROR_UNSUPPORTED_CHARSET; |
169 | 0 | goto cleanup; |
170 | 0 | } |
171 | 3.69k | } |
172 | | |
173 | 3.69k | if (ds_format < 119) { |
174 | 3.61k | ctx->srtlist_len = (ctx->nvar + 1) * sizeof(int16_t); |
175 | 3.61k | } else { |
176 | 75 | ctx->srtlist_len = (ctx->nvar + 1) * sizeof(int32_t); |
177 | 75 | } |
178 | | |
179 | 3.69k | if ((ctx->srtlist = readstat_malloc(ctx->srtlist_len)) == NULL) { |
180 | 0 | retval = READSTAT_ERROR_MALLOC; |
181 | 0 | goto cleanup; |
182 | 0 | } |
183 | | |
184 | 3.69k | if (ctx->nvar > 0) { |
185 | 2.78k | ctx->typlist_len = ctx->nvar * sizeof(uint16_t); |
186 | 2.78k | ctx->varlist_len = ctx->variable_name_len * ctx->nvar * sizeof(char); |
187 | 2.78k | ctx->fmtlist_len = ctx->fmtlist_entry_len * ctx->nvar * sizeof(char); |
188 | 2.78k | ctx->lbllist_len = ctx->lbllist_entry_len * ctx->nvar * sizeof(char); |
189 | 2.78k | ctx->variable_labels_len = ctx->variable_labels_entry_len * ctx->nvar * sizeof(char); |
190 | | |
191 | 2.78k | if ((ctx->typlist = readstat_malloc(ctx->typlist_len)) == NULL) { |
192 | 0 | retval = READSTAT_ERROR_MALLOC; |
193 | 0 | goto cleanup; |
194 | 0 | } |
195 | 2.78k | if ((ctx->varlist = readstat_malloc(ctx->varlist_len)) == NULL) { |
196 | 23 | retval = READSTAT_ERROR_MALLOC; |
197 | 23 | goto cleanup; |
198 | 23 | } |
199 | 2.76k | if ((ctx->fmtlist = readstat_malloc(ctx->fmtlist_len)) == NULL) { |
200 | 0 | retval = READSTAT_ERROR_MALLOC; |
201 | 0 | goto cleanup; |
202 | 0 | } |
203 | 2.76k | if ((ctx->lbllist = readstat_malloc(ctx->lbllist_len)) == NULL) { |
204 | 0 | retval = READSTAT_ERROR_MALLOC; |
205 | 0 | goto cleanup; |
206 | 0 | } |
207 | 2.76k | if ((ctx->variable_labels = readstat_malloc(ctx->variable_labels_len)) == NULL) { |
208 | 3 | retval = READSTAT_ERROR_MALLOC; |
209 | 3 | goto cleanup; |
210 | 3 | } |
211 | 2.76k | } |
212 | | |
213 | 3.66k | ctx->initialized = 1; |
214 | | |
215 | 3.77k | cleanup: |
216 | 3.77k | return retval; |
217 | 3.66k | } |
218 | | |
219 | 4.25k | void dta_ctx_free(dta_ctx_t *ctx) { |
220 | 4.25k | if (ctx->typlist) |
221 | 2.78k | free(ctx->typlist); |
222 | 4.25k | if (ctx->varlist) |
223 | 2.76k | free(ctx->varlist); |
224 | 4.25k | if (ctx->srtlist) |
225 | 3.69k | free(ctx->srtlist); |
226 | 4.25k | if (ctx->fmtlist) |
227 | 2.76k | free(ctx->fmtlist); |
228 | 4.25k | if (ctx->lbllist) |
229 | 2.76k | free(ctx->lbllist); |
230 | 4.25k | if (ctx->variable_labels) |
231 | 2.76k | free(ctx->variable_labels); |
232 | 4.25k | if (ctx->converter) |
233 | 3.30k | iconv_close(ctx->converter); |
234 | 4.25k | if (ctx->data_label) |
235 | 3.41k | free(ctx->data_label); |
236 | 4.25k | if (ctx->variables) { |
237 | 2.78k | int i; |
238 | 52.2M | for (i=0; i<ctx->nvar; i++) { |
239 | 52.2M | if (ctx->variables[i]) |
240 | 37.8k | free(ctx->variables[i]); |
241 | 52.2M | } |
242 | 2.78k | free(ctx->variables); |
243 | 2.78k | } |
244 | 4.25k | if (ctx->strls) { |
245 | 804 | int i; |
246 | 34.5k | for (i=0; i<ctx->strls_count; i++) { |
247 | 33.7k | free(ctx->strls[i]); |
248 | 33.7k | } |
249 | 804 | free(ctx->strls); |
250 | 804 | } |
251 | 4.25k | free(ctx); |
252 | 4.25k | } |
253 | | |
254 | | readstat_error_t dta_type_info(uint16_t typecode, dta_ctx_t *ctx, |
255 | 146k | size_t *max_len, readstat_type_t *out_type) { |
256 | 146k | readstat_error_t retval = READSTAT_OK; |
257 | 146k | size_t len = 0; |
258 | 146k | readstat_type_t type = READSTAT_TYPE_STRING; |
259 | 146k | if (ctx->typlist_version == 111) { |
260 | 36.1k | switch (typecode) { |
261 | 877 | case DTA_111_TYPE_CODE_INT8: |
262 | 877 | len = 1; type = READSTAT_TYPE_INT8; break; |
263 | 434 | case DTA_111_TYPE_CODE_INT16: |
264 | 434 | len = 2; type = READSTAT_TYPE_INT16; break; |
265 | 440 | case DTA_111_TYPE_CODE_INT32: |
266 | 440 | len = 4; type = READSTAT_TYPE_INT32; break; |
267 | 487 | case DTA_111_TYPE_CODE_FLOAT: |
268 | 487 | len = 4; type = READSTAT_TYPE_FLOAT; break; |
269 | 711 | case DTA_111_TYPE_CODE_DOUBLE: |
270 | 711 | len = 8; type = READSTAT_TYPE_DOUBLE; break; |
271 | 33.2k | default: |
272 | 33.2k | len = typecode; type = READSTAT_TYPE_STRING; break; |
273 | 36.1k | } |
274 | 110k | } else if (ctx->typlist_version == 117) { |
275 | 61.8k | switch (typecode) { |
276 | 588 | case DTA_117_TYPE_CODE_INT8: |
277 | 588 | len = 1; type = READSTAT_TYPE_INT8; break; |
278 | 895 | case DTA_117_TYPE_CODE_INT16: |
279 | 895 | len = 2; type = READSTAT_TYPE_INT16; break; |
280 | 1.53k | case DTA_117_TYPE_CODE_INT32: |
281 | 1.53k | len = 4; type = READSTAT_TYPE_INT32; break; |
282 | 854 | case DTA_117_TYPE_CODE_FLOAT: |
283 | 854 | len = 4; type = READSTAT_TYPE_FLOAT; break; |
284 | 837 | case DTA_117_TYPE_CODE_DOUBLE: |
285 | 837 | len = 8; type = READSTAT_TYPE_DOUBLE; break; |
286 | 52.5k | case DTA_117_TYPE_CODE_STRL: |
287 | 52.5k | len = 8; type = READSTAT_TYPE_STRING_REF; break; |
288 | 4.62k | default: |
289 | 4.62k | len = typecode; type = READSTAT_TYPE_STRING; break; |
290 | 61.8k | } |
291 | 61.8k | } else if (typecode < 0x7F) { |
292 | 26.1k | switch (typecode) { |
293 | 23.3k | case DTA_OLD_TYPE_CODE_INT8: |
294 | 23.3k | len = 1; type = READSTAT_TYPE_INT8; break; |
295 | 1.07k | case DTA_OLD_TYPE_CODE_INT16: |
296 | 1.07k | len = 2; type = READSTAT_TYPE_INT16; break; |
297 | 517 | case DTA_OLD_TYPE_CODE_INT32: |
298 | 517 | len = 4; type = READSTAT_TYPE_INT32; break; |
299 | 640 | case DTA_OLD_TYPE_CODE_FLOAT: |
300 | 640 | len = 4; type = READSTAT_TYPE_FLOAT; break; |
301 | 549 | case DTA_OLD_TYPE_CODE_DOUBLE: |
302 | 549 | len = 8; type = READSTAT_TYPE_DOUBLE; break; |
303 | 15 | default: |
304 | 15 | retval = READSTAT_ERROR_PARSE; break; |
305 | 26.1k | } |
306 | 26.1k | } else { |
307 | 22.7k | len = typecode - 0x7F; |
308 | 22.7k | type = READSTAT_TYPE_STRING; |
309 | 22.7k | } |
310 | | |
311 | 146k | if (max_len) |
312 | 146k | *max_len = len; |
313 | 146k | if (out_type) |
314 | 109k | *out_type = type; |
315 | | |
316 | 146k | return retval; |
317 | 146k | } |