1"""
2Module contains tools for processing Stata files into DataFrames
4The StataReader below was originally written by Joe Presbrey as part of PyDTA.
5It has been extended and improved by Skipper Seabold from the Statsmodels
6project who also developed the StataWriter and was finally added to pandas in
7a once again improved version.
9You can find more information on http://presbrey.mit.edu/PyDTA and
10https://www.statsmodels.org/devel/
11"""
12from __future__ import annotations
14from collections import abc
15from datetime import (
16 datetime,
17 timedelta,
18)
19from io import BytesIO
20import os
21import struct
22import sys
23from typing import (
24 IO,
25 TYPE_CHECKING,
26 AnyStr,
27 Callable,
28 Final,
29 cast,
30)
31import warnings
33import numpy as np
35from pandas._libs import lib
36from pandas._libs.lib import infer_dtype
37from pandas._libs.writers import max_len_string_array
38from pandas.errors import (
39 CategoricalConversionWarning,
40 InvalidColumnName,
41 PossiblePrecisionLoss,
42 ValueLabelTypeMismatch,
43)
44from pandas.util._decorators import (
45 Appender,
46 doc,
47)
48from pandas.util._exceptions import find_stack_level
50from pandas.core.dtypes.base import ExtensionDtype
51from pandas.core.dtypes.common import (
52 ensure_object,
53 is_numeric_dtype,
54 is_string_dtype,
55)
56from pandas.core.dtypes.dtypes import CategoricalDtype
58from pandas import (
59 Categorical,
60 DatetimeIndex,
61 NaT,
62 Timestamp,
63 isna,
64 to_datetime,
65 to_timedelta,
66)
67from pandas.core.frame import DataFrame
68from pandas.core.indexes.base import Index
69from pandas.core.indexes.range import RangeIndex
70from pandas.core.series import Series
71from pandas.core.shared_docs import _shared_docs
73from pandas.io.common import get_handle
75if TYPE_CHECKING:
76 from collections.abc import (
77 Hashable,
78 Sequence,
79 )
80 from types import TracebackType
81 from typing import Literal
83 from pandas._typing import (
84 CompressionOptions,
85 FilePath,
86 ReadBuffer,
87 Self,
88 StorageOptions,
89 WriteBuffer,
90 )
92_version_error = (
93 "Version of given Stata file is {version}. pandas supports importing "
94 "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
95 "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
96 "and 119 (Stata 15/16, over 32,767 variables)."
97)

_statafile_processing_params1 = """\
convert_dates : bool, default True
    Convert date variables to DataFrame time values.
convert_categoricals : bool, default True
    Read value labels and convert columns to Categorical/Factor variables."""

_statafile_processing_params2 = """\
index_col : str, optional
    Column to set as index.
convert_missing : bool, default False
    Flag indicating whether to convert missing values to their Stata
    representations. If False, missing values are replaced with nan.
    If True, columns containing missing values are returned with
    object data types and missing values are represented by
    StataMissingValue objects.
preserve_dtypes : bool, default True
    Preserve Stata datatypes. If False, numeric data are upcast to pandas
    default types for foreign data (float64 or int64).
columns : list or None
    Columns to retain. Columns will be returned in the given order. None
    returns all columns.
order_categoricals : bool, default True
    Flag indicating whether converted categorical data are ordered."""

_chunksize_params = """\
chunksize : int, default None
    Return a StataReader object for iteration; returns chunks with
    the given number of lines."""

_iterator_params = """\
iterator : bool, default False
    Return a StataReader object."""

_reader_notes = """\
Notes
-----
Categorical variables read through an iterator may not have the same
categories and dtype. This occurs when a variable stored in a DTA
file is associated with an incomplete set of value labels that only
label a strict subset of the values."""

_read_stata_doc = f"""
Read Stata file into DataFrame.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, and file. For file URLs, a host is
    expected. A local file could be: ``file://localhost/path/to/table.dta``.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method,
    such as a file handle (e.g. via builtin ``open`` function)
    or ``StringIO``.
{_statafile_processing_params1}
{_statafile_processing_params2}
{_chunksize_params}
{_iterator_params}
{_shared_docs["decompression_options"] % "filepath_or_buffer"}
{_shared_docs["storage_options"]}

Returns
-------
DataFrame or pandas.api.typing.StataReader

See Also
--------
io.stata.StataReader : Low-level reader for Stata data files.
DataFrame.to_stata : Export Stata data files.

{_reader_notes}

Examples
--------

Creating a dummy stata file for this example

>>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
...                     'speed': [350, 18, 361, 15]}})  # doctest: +SKIP
>>> df.to_stata('animals.dta')  # doctest: +SKIP

Read a Stata dta file:

>>> df = pd.read_stata('animals.dta')  # doctest: +SKIP

Read a Stata dta file in 10,000 line chunks:

>>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8")  # doctest: +SKIP
>>> df = pd.DataFrame(values, columns=["i"])  # doctest: +SKIP
>>> df.to_stata('filename.dta')  # doctest: +SKIP

>>> with pd.read_stata('filename.dta', chunksize=10000) as itr:  # doctest: +SKIP
...     for chunk in itr:
...         # Operate on a single chunk, e.g., chunk.mean()
...         pass  # doctest: +SKIP
"""

_read_method_doc = f"""\
Reads observations from a Stata file, converting them into a DataFrame

Parameters
----------
nrows : int
    Number of lines to read from data file, if None read whole file.
{_statafile_processing_params1}
{_statafile_processing_params2}

Returns
-------
DataFrame
"""

_stata_reader_doc = f"""\
Class for reading Stata dta files.

Parameters
----------
path_or_buf : path (string), buffer or path object
    string, path object (pathlib.Path or py._path.local.LocalPath) or object
    implementing a binary read() function.
{_statafile_processing_params1}
{_statafile_processing_params2}
{_chunksize_params}
{_shared_docs["decompression_options"]}
{_shared_docs["storage_options"]}

{_reader_notes}
"""


_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]


stata_epoch: Final = datetime(1960, 1, 1)
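

# Illustrative sketch (not part of pandas): every Stata Internal Format (SIF)
# date is an integer offset from this 1960-01-01 epoch. For the daily format
# %td the offset is a day count, so converting a hypothetical raw value by
# hand looks like:
#
#     sif_days = 21915                        # hypothetical %td value
#     stata_epoch + timedelta(days=sif_days)  # -> datetime(2020, 1, 1)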


def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
    """
    Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        The Stata Internal Format date to convert to datetime according to fmt
    fmt : str
        The format to convert to. Can be tc, td, tw, tm, tq, th, or ty

    Returns
    -------
    converted : Series
        The converted dates

    Examples
    --------
    >>> dates = pd.Series([52])
    >>> _stata_elapsed_date_to_datetime_vec(dates, "%tw")
    0   1961-01-01
    dtype: datetime64[ns]

    Notes
    -----
    datetime/c - tc
        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
    datetime/C - tC - NOT IMPLEMENTED
        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
    date - td
        days since 01jan1960 (01jan1960 = 0)
    weekly date - tw
        weeks since 1960w1
        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
        The datetime value is the start of the week in terms of days in the
        year, not ISO calendar weeks.
    monthly date - tm
        months since 1960m1
    quarterly date - tq
        quarters since 1960q1
    half-yearly date - th
        half-years since 1960h1
    yearly date - ty
        years since 0000
    """
    MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
    MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days
    MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days
    MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
    MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

    def convert_year_month_safe(year, month) -> Series:
        """
        Convert year and month to datetimes, using pandas vectorized versions
        when the date range falls within the range supported by pandas.
        Otherwise it falls back to a slower but more robust method
        using datetime.
        """
        if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
            return to_datetime(100 * year + month, format="%Y%m")
        else:
            index = getattr(year, "index", None)
            return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index)

    def convert_year_days_safe(year, days) -> Series:
        """
        Converts year (e.g. 1999) and days since the start of the year to a
        datetime or datetime64 Series
        """
        if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
            return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
        else:
            index = getattr(year, "index", None)
            value = [
                datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days)
            ]
            return Series(value, index=index)

    def convert_delta_safe(base, deltas, unit) -> Series:
        """
        Convert base dates and deltas to datetimes, using pandas vectorized
        versions if the deltas satisfy restrictions required to be expressed
        as dates in pandas.
        """
        index = getattr(deltas, "index", None)
        if unit == "d":
            if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
                values = [base + timedelta(days=int(d)) for d in deltas]
                return Series(values, index=index)
        elif unit == "ms":
            if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
                values = [
                    base + timedelta(microseconds=(int(d) * 1000)) for d in deltas
                ]
                return Series(values, index=index)
        else:
            raise ValueError("format not understood")
        base = to_datetime(base)
        deltas = to_timedelta(deltas, unit=unit)
        return base + deltas

    # TODO(non-nano): If/when pandas supports more than datetime64[ns], this
    # should be improved to use correct range, e.g. datetime[Y] for yearly
    bad_locs = np.isnan(dates)
    has_bad_values = False
    if bad_locs.any():
        has_bad_values = True
        dates._values[bad_locs] = 1.0  # Replace with NaT
    dates = dates.astype(np.int64)

    if fmt.startswith(("%tc", "tc")):  # Delta ms relative to base
        base = stata_epoch
        ms = dates
        conv_dates = convert_delta_safe(base, ms, "ms")
    elif fmt.startswith(("%tC", "tC")):
        warnings.warn(
            "Encountered %tC format. Leaving in Stata Internal Format.",
            stacklevel=find_stack_level(),
        )
        conv_dates = Series(dates, dtype=object)
        if has_bad_values:
            conv_dates[bad_locs] = NaT
        return conv_dates
    # Delta days relative to base
    elif fmt.startswith(("%td", "td", "%d", "d")):
        base = stata_epoch
        days = dates
        conv_dates = convert_delta_safe(base, days, "d")
    # does not count leap days - 7 days is a week.
    # 52nd week may have more than 7 days
    elif fmt.startswith(("%tw", "tw")):
        year = stata_epoch.year + dates // 52
        days = (dates % 52) * 7
        conv_dates = convert_year_days_safe(year, days)
    elif fmt.startswith(("%tm", "tm")):  # Delta months relative to base
        year = stata_epoch.year + dates // 12
        month = (dates % 12) + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%tq", "tq")):  # Delta quarters relative to base
        year = stata_epoch.year + dates // 4
        quarter_month = (dates % 4) * 3 + 1
        conv_dates = convert_year_month_safe(year, quarter_month)
    elif fmt.startswith(("%th", "th")):  # Delta half-years relative to base
        year = stata_epoch.year + dates // 2
        month = (dates % 2) * 6 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt.startswith(("%ty", "ty")):  # Years -- not delta
        year = dates
        first_month = np.ones_like(dates)
        conv_dates = convert_year_month_safe(year, first_month)
    else:
        raise ValueError(f"Date fmt {fmt} not understood")

    if has_bad_values:  # Restore NaT for bad values
        conv_dates[bad_locs] = NaT

    return conv_dates
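

# Illustrative sketch (not part of pandas): converting the hypothetical %tq
# value 4 by hand mirrors the arithmetic above -- 1960 + 4 // 4 = 1961 and
# (4 % 4) * 3 + 1 = 1, i.e. 1961q1 starts on 1961-01-01:
#
#     _stata_elapsed_date_to_datetime_vec(pd.Series([4]), "%tq")
#     # 0   1961-01-01
#     # dtype: datetime64[ns]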


def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
    """
    Convert from datetime to SIF. https://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : Series
        Series or array containing datetime or datetime64[ns] to
        convert to the Stata Internal Format given by fmt
    fmt : str
        The format to convert to. Can be tc, td, tw, tm, tq, th, or ty
    """
    index = dates.index
    NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
    US_PER_DAY = NS_PER_DAY / 1000

    def parse_dates_safe(
        dates: Series, delta: bool = False, year: bool = False, days: bool = False
    ):
        d = {}
        if lib.is_np_dtype(dates.dtype, "M"):
            if delta:
                time_delta = dates - Timestamp(stata_epoch).as_unit("ns")
                d["delta"] = time_delta._values.view(np.int64) // 1000  # microseconds
            if days or year:
                date_index = DatetimeIndex(dates)
                d["year"] = date_index._data.year
                d["month"] = date_index._data.month
            if days:
                days_in_ns = dates._values.view(np.int64) - to_datetime(
                    d["year"], format="%Y"
                )._values.view(np.int64)
                d["days"] = days_in_ns // NS_PER_DAY

        elif infer_dtype(dates, skipna=False) == "datetime":
            if delta:
                delta = dates._values - stata_epoch

                def f(x: timedelta) -> float:
                    return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds

                v = np.vectorize(f)
                d["delta"] = v(delta)
            if year:
                year_month = dates.apply(lambda x: 100 * x.year + x.month)
                d["year"] = year_month._values // 100
                d["month"] = year_month._values - d["year"] * 100
            if days:

                def g(x: datetime) -> int:
                    return (x - datetime(x.year, 1, 1)).days

                v = np.vectorize(g)
                d["days"] = v(dates)
        else:
            raise ValueError(
                "Columns containing dates must contain either "
                "datetime64, datetime or null values."
            )

        return DataFrame(d, index=index)

    bad_loc = isna(dates)
    index = dates.index
    if bad_loc.any():
        if lib.is_np_dtype(dates.dtype, "M"):
            dates._values[bad_loc] = to_datetime(stata_epoch)
        else:
            dates._values[bad_loc] = stata_epoch

    if fmt in ["%tc", "tc"]:
        d = parse_dates_safe(dates, delta=True)
        conv_dates = d.delta / 1000
    elif fmt in ["%tC", "tC"]:
        warnings.warn(
            "Stata Internal Format tC not supported.",
            stacklevel=find_stack_level(),
        )
        conv_dates = dates
    elif fmt in ["%td", "td"]:
        d = parse_dates_safe(dates, delta=True)
        conv_dates = d.delta // US_PER_DAY
    elif fmt in ["%tw", "tw"]:
        d = parse_dates_safe(dates, year=True, days=True)
        conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
    elif fmt in ["%tm", "tm"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1
    elif fmt in ["%tq", "tq"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
    elif fmt in ["%th", "th"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int)
    elif fmt in ["%ty", "ty"]:
        d = parse_dates_safe(dates, year=True)
        conv_dates = d.year
    else:
        raise ValueError(f"Format {fmt} is not a known Stata date format")

    conv_dates = Series(conv_dates, dtype=np.float64, copy=False)
    missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
    conv_dates[bad_loc] = missing_value

    return Series(conv_dates, index=index, copy=False)
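

# Illustrative sketch (not part of pandas): the two vectorized helpers above
# are inverses for the supported formats, so a round trip through %tm recovers
# the first day of the month:
#
#     sif = _datetime_to_stata_elapsed_vec(
#         pd.Series([pd.Timestamp("1961-02-15")]), "%tm"
#     )
#     # sif.iloc[0] == 13.0, i.e. 13 months after 1960m1
#     _stata_elapsed_date_to_datetime_vec(sif, "%tm")
#     # 0   1961-02-01  (day-of-month information is not stored in %tm)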


excessive_string_length_error: Final = """
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
characters. Column '{0}' does not satisfy this restriction. Use the
'version=117' parameter to write the newer (Stata 13 and later) format.
"""


precision_loss_doc: Final = """
Column converted from {0} to {1}, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""


value_label_mismatch_doc: Final = """
Stata value labels (pandas categories) must be strings. Column {0} contains
non-string labels which will be converted to strings. Please check that the
Stata data file created has not lost information due to duplicate labels.
"""


invalid_name_doc: Final = """
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    {0}

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
"""


categorical_conversion_warning: Final = """
One or more series with value labels are not fully labeled. Reading this
dataset with an iterator results in a categorical variable with different
categories. This occurs since it is not possible to know all possible values
until the entire dataset has been read. To avoid this warning, you can either
read the dataset without an iterator, or manually convert categorical data by
setting ``convert_categoricals`` to False and then accessing the variable
labels through the value_labels method of the reader.
"""


def _cast_to_stata_types(data: DataFrame) -> DataFrame:
    """
    Checks the dtypes of the columns of a pandas DataFrame for
    compatibility with the data types and ranges supported by Stata, and
    converts if necessary.

    Parameters
    ----------
    data : DataFrame
        The DataFrame to check and convert

    Notes
    -----
    Numeric columns in Stata must be one of int8, int16, int32, float32 or
    float64, with some additional value restrictions. int8 and int16 columns
    are checked for violations of the value restrictions and upcast if needed.
    int64 data is not usable in Stata, and so it is downcast to int32 whenever
    the values are in the int32 range, and sidecast to float64 when larger than
    this range. If the int64 values are outside of the range of those
    perfectly representable as float64 values, a warning is raised.

    bool columns are cast to int8. uint columns are converted to int of the
    same size if there is no loss in precision, otherwise are upcast to a
    larger type. uint64 is currently not supported since it is converted to
    object in a DataFrame.
    """
    ws = ""
    # original, if small, if large
    conversion_data: tuple[
        tuple[type, type, type],
        tuple[type, type, type],
        tuple[type, type, type],
        tuple[type, type, type],
        tuple[type, type, type],
    ] = (
        (np.bool_, np.int8, np.int8),
        (np.uint8, np.int8, np.int16),
        (np.uint16, np.int16, np.int32),
        (np.uint32, np.int32, np.int64),
        (np.uint64, np.int64, np.float64),
    )

    float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]
    float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]

    for col in data:
        # Cast from unsupported types to supported types
        is_nullable_int = (
            isinstance(data[col].dtype, ExtensionDtype)
            and data[col].dtype.kind in "iub"
        )
        # We need to find orig_missing before altering data below
        orig_missing = data[col].isna()
        if is_nullable_int:
            fv = 0 if data[col].dtype.kind in "iu" else False
            # Replace with NumPy-compatible column
            data[col] = data[col].fillna(fv).astype(data[col].dtype.numpy_dtype)
        elif isinstance(data[col].dtype, ExtensionDtype):
            if getattr(data[col].dtype, "numpy_dtype", None) is not None:
                data[col] = data[col].astype(data[col].dtype.numpy_dtype)
            elif is_string_dtype(data[col].dtype):
                data[col] = data[col].astype("object")

        dtype = data[col].dtype
        empty_df = data.shape[0] == 0
        for c_data in conversion_data:
            if dtype == c_data[0]:
                if empty_df or data[col].max() <= np.iinfo(c_data[1]).max:
                    dtype = c_data[1]
                else:
                    dtype = c_data[2]
                    if c_data[2] == np.int64:  # Warn if necessary
                        if data[col].max() >= 2**53:
                            ws = precision_loss_doc.format("uint64", "float64")

                data[col] = data[col].astype(dtype)

        # Check values and upcast if necessary

        if dtype == np.int8 and not empty_df:
            if data[col].max() > 100 or data[col].min() < -127:
                data[col] = data[col].astype(np.int16)
        elif dtype == np.int16 and not empty_df:
            if data[col].max() > 32740 or data[col].min() < -32767:
                data[col] = data[col].astype(np.int32)
        elif dtype == np.int64:
            if empty_df or (
                data[col].max() <= 2147483620 and data[col].min() >= -2147483647
            ):
                data[col] = data[col].astype(np.int32)
            else:
                data[col] = data[col].astype(np.float64)
                if data[col].max() >= 2**53 or data[col].min() <= -(2**53):
                    ws = precision_loss_doc.format("int64", "float64")
        elif dtype in (np.float32, np.float64):
            if np.isinf(data[col]).any():
                raise ValueError(
                    f"Column {col} contains infinity or -infinity "
                    "which is outside the range supported by Stata."
                )
            value = data[col].max()
            if dtype == np.float32 and value > float32_max:
                data[col] = data[col].astype(np.float64)
            elif dtype == np.float64:
                if value > float64_max:
                    raise ValueError(
                        f"Column {col} has a maximum value ({value}) outside the range "
                        f"supported by Stata ({float64_max})"
                    )
        if is_nullable_int:
            if orig_missing.any():
                # Replace missing by Stata sentinel value
                sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]
                data.loc[orig_missing, col] = sentinel

    if ws:
        warnings.warn(
            ws,
            PossiblePrecisionLoss,
            stacklevel=find_stack_level(),
        )

    return data
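

# Illustrative sketch (not part of pandas): uint8 is not a Stata type, so a
# hypothetical column with values up to 200 cannot stay int8 (max 127) and is
# upcast along the (np.uint8, np.int8, np.int16) row of conversion_data:
#
#     df = DataFrame({"x": np.array([0, 200], dtype=np.uint8)})
#     _cast_to_stata_types(df)["x"].dtype   # -> dtype('int16')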


class StataValueLabel:
    """
    Parse a categorical column and prepare formatted output

    Parameters
    ----------
    catarray : Series
        Categorical Series to encode
    encoding : {"latin-1", "utf-8"}
        Encoding to use for value labels.
    """

    def __init__(
        self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1"
    ) -> None:
        if encoding not in ("latin-1", "utf-8"):
            raise ValueError("Only latin-1 and utf-8 are supported.")
        self.labname = catarray.name
        self._encoding = encoding
        categories = catarray.cat.categories
        self.value_labels = enumerate(categories)

        self._prepare_value_labels()

    def _prepare_value_labels(self) -> None:
        """Encode value labels."""

        self.text_len = 0
        self.txt: list[bytes] = []
        self.n = 0
        # Offsets (length of categories), converted to int32
        self.off = np.array([], dtype=np.int32)
        # Values, converted to int32
        self.val = np.array([], dtype=np.int32)
        self.len = 0

        # Compute lengths and setup lists of offsets and labels
        offsets: list[int] = []
        values: list[float] = []
        for vl in self.value_labels:
            category: str | bytes = vl[1]
            if not isinstance(category, str):
                category = str(category)
                warnings.warn(
                    value_label_mismatch_doc.format(self.labname),
                    ValueLabelTypeMismatch,
                    stacklevel=find_stack_level(),
                )
            category = category.encode(self._encoding)
            offsets.append(self.text_len)
            self.text_len += len(category) + 1  # +1 for the padding
            values.append(vl[0])
            self.txt.append(category)
            self.n += 1

        if self.text_len > 32000:
            raise ValueError(
                "Stata value labels for a single variable must "
                "have a combined length less than 32,000 characters."
            )

        # Ensure int32
        self.off = np.array(offsets, dtype=np.int32)
        self.val = np.array(values, dtype=np.int32)

        # Total length
        self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len

    def generate_value_label(self, byteorder: str) -> bytes:
        """
        Generate the binary representation of the value labels.

        Parameters
        ----------
        byteorder : str
            Byte order of the output

        Returns
        -------
        value_label : bytes
            Bytes containing the formatted value label
        """
        encoding = self._encoding
        bio = BytesIO()
        null_byte = b"\x00"

        # len
        bio.write(struct.pack(byteorder + "i", self.len))

        # labname
        labname = str(self.labname)[:32].encode(encoding)
        lab_len = 32 if encoding not in ("utf-8", "utf8") else 128
        labname = _pad_bytes(labname, lab_len + 1)
        bio.write(labname)

        # padding - 3 bytes
        for i in range(3):
            bio.write(struct.pack("c", null_byte))

        # value_label_table
        # n - int32
        bio.write(struct.pack(byteorder + "i", self.n))

        # textlen - int32
        bio.write(struct.pack(byteorder + "i", self.text_len))

        # off - int32 array (n elements)
        for offset in self.off:
            bio.write(struct.pack(byteorder + "i", offset))

        # val - int32 array (n elements)
        for value in self.val:
            bio.write(struct.pack(byteorder + "i", value))

        # txt - Text labels, null terminated
        for text in self.txt:
            bio.write(text + null_byte)

        return bio.getvalue()
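

# Illustrative sketch (not part of pandas): for a hypothetical categorical
# with categories ["no", "yes"], _prepare_value_labels produces offsets [0, 3]
# (each label is null-terminated, so "no" occupies 3 bytes), values [0, 1],
# text_len 7, and a table length of 4 + 4 + 4 * 2 + 4 * 2 + 7 = 31 bytes.
# generate_value_label then prepends the length field, the padded label name
# and 3 padding bytes (4 + 33 + 3 + 31 = 71 bytes total for latin-1):
#
#     svl = StataValueLabel(Series(["no", "yes"], dtype="category", name="flag"))
#     len(svl.generate_value_label("<"))   # -> 71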


class StataNonCatValueLabel(StataValueLabel):
    """
    Prepare formatted version of value labels

    Parameters
    ----------
    labname : str
        Value label name
    value_labels : dict
        Mapping of values to labels
    encoding : {"latin-1", "utf-8"}
        Encoding to use for value labels.
    """

    def __init__(
        self,
        labname: str,
        value_labels: dict[float, str],
        encoding: Literal["latin-1", "utf-8"] = "latin-1",
    ) -> None:
        if encoding not in ("latin-1", "utf-8"):
            raise ValueError("Only latin-1 and utf-8 are supported.")

        self.labname = labname
        self._encoding = encoding
        self.value_labels = sorted(  # type: ignore[assignment]
            value_labels.items(), key=lambda x: x[0]
        )
        self._prepare_value_labels()


class StataMissingValue:
    """
    An observation's missing value.

    Parameters
    ----------
    value : {int, float}
        The Stata missing value code

    Notes
    -----
    More information: <https://www.stata.com/help.cgi?missing>

    Integer missing values map the codes '.', '.a', ..., '.z' to the ranges
    101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...
    2147483647 (for int32). Missing values for floating point data types are
    more complex, but the pattern is simple to discern from the following table.

    np.float32 missing values (float in Stata)
    0000007f    .
    0008007f    .a
    0010007f    .b
    ...
    00c0007f    .x
    00c8007f    .y
    00d0007f    .z

    np.float64 missing values (double in Stata)
    000000000000e07f    .
    000000000001e07f    .a
    000000000002e07f    .b
    ...
    000000000018e07f    .x
    000000000019e07f    .y
    00000000001ae07f    .z
    """

    # Construct a dictionary of missing values
    MISSING_VALUES: dict[float, str] = {}
    bases: Final = (101, 32741, 2147483621)
    for b in bases:
        # Conversion to long to avoid hash issues on 32 bit platforms #8968
        MISSING_VALUES[b] = "."
        for i in range(1, 27):
            MISSING_VALUES[i + b] = "." + chr(96 + i)

    float32_base: bytes = b"\x00\x00\x00\x7f"
    increment_32: int = struct.unpack("<i", b"\x00\x08\x00\x00")[0]
    for i in range(27):
        key = struct.unpack("<f", float32_base)[0]
        MISSING_VALUES[key] = "."
        if i > 0:
            MISSING_VALUES[key] += chr(96 + i)
        int_value = struct.unpack("<i", struct.pack("<f", key))[0] + increment_32
        float32_base = struct.pack("<i", int_value)

    float64_base: bytes = b"\x00\x00\x00\x00\x00\x00\xe0\x7f"
    increment_64 = struct.unpack("q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0]
    for i in range(27):
        key = struct.unpack("<d", float64_base)[0]
        MISSING_VALUES[key] = "."
        if i > 0:
            MISSING_VALUES[key] += chr(96 + i)
        int_value = struct.unpack("q", struct.pack("<d", key))[0] + increment_64
        float64_base = struct.pack("q", int_value)

    BASE_MISSING_VALUES: Final = {
        "int8": 101,
        "int16": 32741,
        "int32": 2147483621,
        "float32": struct.unpack("<f", float32_base)[0],
        "float64": struct.unpack("<d", float64_base)[0],
    }

    def __init__(self, value: float) -> None:
        self._value = value
        # Conversion to int to avoid hash issues on 32 bit platforms #8968
        value = int(value) if value < 2147483648 else float(value)
        self._str = self.MISSING_VALUES[value]

    @property
    def string(self) -> str:
        """
        The Stata representation of the missing value: '.', '.a'..'.z'

        Returns
        -------
        str
            The representation of the missing value.
        """
        return self._str

    @property
    def value(self) -> float:
        """
        The binary representation of the missing value.

        Returns
        -------
        {int, float}
            The binary representation of the missing value.
        """
        return self._value

    def __str__(self) -> str:
        return self.string

    def __repr__(self) -> str:
        return f"{type(self)}({self})"

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, type(self))
            and self.string == other.string
            and self.value == other.value
        )

    @classmethod
    def get_base_missing_value(cls, dtype: np.dtype) -> float:
        if dtype.type is np.int8:
            value = cls.BASE_MISSING_VALUES["int8"]
        elif dtype.type is np.int16:
            value = cls.BASE_MISSING_VALUES["int16"]
        elif dtype.type is np.int32:
            value = cls.BASE_MISSING_VALUES["int32"]
        elif dtype.type is np.float32:
            value = cls.BASE_MISSING_VALUES["float32"]
        elif dtype.type is np.float64:
            value = cls.BASE_MISSING_VALUES["float64"]
        else:
            raise ValueError("Unsupported dtype")
        return value
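

# Illustrative sketch (not part of pandas): Stata encodes '.a' for an int8
# column as 102, one above the base code 101 for the generic '.', so:
#
#     mv = StataMissingValue(102)
#     str(mv)    # -> '.a'
#     mv.value   # -> 102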


class StataParser:
    def __init__(self) -> None:
        # type          code.
        # --------------------
        # str1        1 = 0x01
        # str2        2 = 0x02
        # ...
        # str244    244 = 0xf4
        # byte      251 = 0xfb  (sic)
        # int       252 = 0xfc
        # long      253 = 0xfd
        # float     254 = 0xfe
        # double    255 = 0xff
        # --------------------
        # NOTE: the byte type seems to be reserved for categorical variables
        # with a label, but the underlying variable is -127 to 100
        # we're going to drop the label and cast to int
        self.DTYPE_MAP = dict(
            [(i, np.dtype(f"S{i}")) for i in range(1, 245)]
            + [
                (251, np.dtype(np.int8)),
                (252, np.dtype(np.int16)),
                (253, np.dtype(np.int32)),
                (254, np.dtype(np.float32)),
                (255, np.dtype(np.float64)),
            ]
        )
        self.DTYPE_MAP_XML: dict[int, np.dtype] = {
            32768: np.dtype(np.uint8),  # Keys to GSO
            65526: np.dtype(np.float64),
            65527: np.dtype(np.float32),
            65528: np.dtype(np.int32),
            65529: np.dtype(np.int16),
            65530: np.dtype(np.int8),
        }
        self.TYPE_MAP = list(tuple(range(251)) + tuple("bhlfd"))
        self.TYPE_MAP_XML = {
            # Not really a Q, unclear how to handle byteswap
            32768: "Q",
            65526: "d",
            65527: "f",
            65528: "l",
            65529: "h",
            65530: "b",
        }
        # NOTE: technically, some of these are wrong. there are more numbers
        # that can be represented. it's the 27 ABOVE and BELOW the max listed
        # numeric data type in [U] 12.2.2 of the 11.2 manual
        float32_min = b"\xff\xff\xff\xfe"
        float32_max = b"\xff\xff\xff\x7e"
        float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff"
        float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f"
        self.VALID_RANGE = {
            "b": (-127, 100),
            "h": (-32767, 32740),
            "l": (-2147483647, 2147483620),
            "f": (
                np.float32(struct.unpack("<f", float32_min)[0]),
                np.float32(struct.unpack("<f", float32_max)[0]),
            ),
            "d": (
                np.float64(struct.unpack("<d", float64_min)[0]),
                np.float64(struct.unpack("<d", float64_max)[0]),
            ),
        }

        self.OLD_TYPE_MAPPING = {
            98: 251,  # byte
            105: 252,  # int
            108: 253,  # long
            102: 254,  # float
            100: 255,  # double
        }

        # These missing values are the generic '.' in Stata, and are used
        # to replace nans
        self.MISSING_VALUES = {
            "b": 101,
            "h": 32741,
            "l": 2147483621,
            "f": np.float32(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]),
            "d": np.float64(
                struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
            ),
        }
        self.NUMPY_TYPE_MAP = {
            "b": "i1",
            "h": "i2",
            "l": "i4",
            "f": "f4",
            "d": "f8",
            "Q": "u8",
        }

        # Reserved words cannot be used as variable names
        self.RESERVED_WORDS = {
            "aggregate",
            "array",
            "boolean",
            "break",
            "byte",
            "case",
            "catch",
            "class",
            "colvector",
            "complex",
            "const",
            "continue",
            "default",
            "delegate",
            "delete",
            "do",
            "double",
            "else",
            "eltypedef",
            "end",
            "enum",
            "explicit",
            "export",
            "external",
            "float",
            "for",
            "friend",
            "function",
            "global",
            "goto",
            "if",
            "inline",
            "int",
            "local",
            "long",
            "NULL",
            "pragma",
            "protected",
            "quad",
            "rowvector",
            "short",
            "typedef",
            "typename",
            "virtual",
            "_all",
            "_N",
            "_skip",
            "_b",
            "_pi",
            "str#",
            "in",
            "_pred",
            "strL",
            "_coef",
            "_rc",
            "using",
            "_cons",
            "_se",
            "with",
            "_n",
        }
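

# Illustrative sketch (not part of pandas): the reader flags out-of-range
# values as missing using VALID_RANGE. For the int8 type code "b" the valid
# data range is (-127, 100), so a stored byte of 101 or more is one of the 27
# missing codes '.', '.a', ..., '.z' rather than data:
#
#     parser = StataParser()
#     nmin, nmax = parser.VALID_RANGE["b"]   # (-127, 100)
#     101 > nmax                             # True -> treated as missing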


class StataReader(StataParser, abc.Iterator):
    __doc__ = _stata_reader_doc

    _path_or_buf: IO[bytes]  # Will be assigned by `_open_file`.

    def __init__(
        self,
        path_or_buf: FilePath | ReadBuffer[bytes],
        convert_dates: bool = True,
        convert_categoricals: bool = True,
        index_col: str | None = None,
        convert_missing: bool = False,
        preserve_dtypes: bool = True,
        columns: Sequence[str] | None = None,
        order_categoricals: bool = True,
        chunksize: int | None = None,
        compression: CompressionOptions = "infer",
        storage_options: StorageOptions | None = None,
    ) -> None:
        super().__init__()

        # Arguments to the reader (can be temporarily overridden in
        # calls to read).
        self._convert_dates = convert_dates
        self._convert_categoricals = convert_categoricals
        self._index_col = index_col
        self._convert_missing = convert_missing
        self._preserve_dtypes = preserve_dtypes
        self._columns = columns
        self._order_categoricals = order_categoricals
        self._original_path_or_buf = path_or_buf
        self._compression = compression
        self._storage_options = storage_options
        self._encoding = ""
        self._chunksize = chunksize
        self._using_iterator = False
        self._entered = False
        if self._chunksize is None:
            self._chunksize = 1
        elif not isinstance(chunksize, int) or chunksize <= 0:
            raise ValueError("chunksize must be a positive integer when set.")

        # State variables for the file
        self._close_file: Callable[[], None] | None = None
        self._missing_values = False
        self._can_read_value_labels = False
        self._column_selector_set = False
        self._value_labels_read = False
        self._data_read = False
        self._dtype: np.dtype | None = None
        self._lines_read = 0

        self._native_byteorder = _set_endianness(sys.byteorder)

    def _ensure_open(self) -> None:
        """
        Ensure the file has been opened and its header data read.
        """
        if not hasattr(self, "_path_or_buf"):
            self._open_file()

    def _open_file(self) -> None:
        """
        Open the file (with compression options, etc.), and read header information.
        """
        if not self._entered:
            warnings.warn(
                "StataReader is being used without using a context manager. "
                "Using StataReader as a context manager is the only supported method.",
                ResourceWarning,
                stacklevel=find_stack_level(),
            )
        handles = get_handle(
            self._original_path_or_buf,
            "rb",
            storage_options=self._storage_options,
            is_text=False,
            compression=self._compression,
        )
        if hasattr(handles.handle, "seekable") and handles.handle.seekable():
            # If the handle is directly seekable, use it without an extra copy.
            self._path_or_buf = handles.handle
            self._close_file = handles.close
        else:
            # Copy to memory, and ensure no encoding.
            with handles:
                self._path_or_buf = BytesIO(handles.handle.read())
                self._close_file = self._path_or_buf.close

        self._read_header()
        self._setup_dtype()

    def __enter__(self) -> Self:
        """enter context manager"""
        self._entered = True
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        if self._close_file:
            self._close_file()

    def close(self) -> None:
        """Close the handle if it's open.

        .. deprecated:: 2.0.0

            The close method is not part of the public API.
            The only supported way to use StataReader is to use it as a
            context manager.
        """
        warnings.warn(
            "The StataReader.close() method is not part of the public API and "
            "will be removed in a future version without notice. "
            "Using StataReader as a context manager is the only supported method.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        if self._close_file:
            self._close_file()

    def _set_encoding(self) -> None:
        """
        Set string encoding which depends on file version
        """
        if self._format_version < 118:
            self._encoding = "latin-1"
        else:
            self._encoding = "utf-8"

    def _read_int8(self) -> int:
        return struct.unpack("b", self._path_or_buf.read(1))[0]

    def _read_uint8(self) -> int:
        return struct.unpack("B", self._path_or_buf.read(1))[0]

    def _read_uint16(self) -> int:
        return struct.unpack(f"{self._byteorder}H", self._path_or_buf.read(2))[0]

    def _read_uint32(self) -> int:
        return struct.unpack(f"{self._byteorder}I", self._path_or_buf.read(4))[0]

    def _read_uint64(self) -> int:
        return struct.unpack(f"{self._byteorder}Q", self._path_or_buf.read(8))[0]

    def _read_int16(self) -> int:
        return struct.unpack(f"{self._byteorder}h", self._path_or_buf.read(2))[0]

    def _read_int32(self) -> int:
        return struct.unpack(f"{self._byteorder}i", self._path_or_buf.read(4))[0]

    def _read_int64(self) -> int:
        return struct.unpack(f"{self._byteorder}q", self._path_or_buf.read(8))[0]

    def _read_char8(self) -> bytes:
        return struct.unpack("c", self._path_or_buf.read(1))[0]

    def _read_int16_count(self, count: int) -> tuple[int, ...]:
        return struct.unpack(
            f"{self._byteorder}{'h' * count}",
            self._path_or_buf.read(2 * count),
        )
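
    # Illustrative sketch (not part of pandas): the _read_* helpers above are
    # thin wrappers over struct.unpack, with self._byteorder ("<" or ">")
    # prepended so the same code reads both little- and big-endian dta files.
    # For example, assuming a little-endian file:
    #
    #     struct.unpack("<H", b"\x2c\x01")[0]   # -> 300, as _read_uint16 would return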

    def _read_header(self) -> None:
        first_char = self._read_char8()
        if first_char == b"<":
            self._read_new_header()
        else:
            self._read_old_header(first_char)

    def _read_new_header(self) -> None:
        # The first part of the header is common to 117 - 119.
        self._path_or_buf.read(27)  # stata_dta><header><release>
        self._format_version = int(self._path_or_buf.read(3))
        if self._format_version not in [117, 118, 119]:
            raise ValueError(_version_error.format(version=self._format_version))
        self._set_encoding()
        self._path_or_buf.read(21)  # </release><byteorder>
        self._byteorder = ">" if self._path_or_buf.read(3) == b"MSF" else "<"
        self._path_or_buf.read(15)  # </byteorder><K>
        self._nvar = (
            self._read_uint16() if self._format_version <= 118 else self._read_uint32()
        )
        self._path_or_buf.read(7)  # </K><N>

        self._nobs = self._get_nobs()
        self._path_or_buf.read(11)  # </N><label>
        self._data_label = self._get_data_label()
        self._path_or_buf.read(19)  # </label><timestamp>
        self._time_stamp = self._get_time_stamp()
        self._path_or_buf.read(26)  # </timestamp></header><map>
        self._path_or_buf.read(8)  # 0x0000000000000000
        self._path_or_buf.read(8)  # position of <map>

        self._seek_vartypes = self._read_int64() + 16
        self._seek_varnames = self._read_int64() + 10
        self._seek_sortlist = self._read_int64() + 10
        self._seek_formats = self._read_int64() + 9
        self._seek_value_label_names = self._read_int64() + 19

        # Requires version-specific treatment
        self._seek_variable_labels = self._get_seek_variable_labels()

        self._path_or_buf.read(8)  # <characteristics>
        self._data_location = self._read_int64() + 6
        self._seek_strls = self._read_int64() + 7
        self._seek_value_labels = self._read_int64() + 14

        self._typlist, self._dtyplist = self._get_dtypes(self._seek_vartypes)

        self._path_or_buf.seek(self._seek_varnames)
        self._varlist = self._get_varlist()

        self._path_or_buf.seek(self._seek_sortlist)
        self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]

        self._path_or_buf.seek(self._seek_formats)
        self._fmtlist = self._get_fmtlist()

        self._path_or_buf.seek(self._seek_value_label_names)
        self._lbllist = self._get_lbllist()

        self._path_or_buf.seek(self._seek_variable_labels)
        self._variable_labels = self._get_variable_labels()

    # Get data type information, works for versions 117-119.
    def _get_dtypes(
        self, seek_vartypes: int
    ) -> tuple[list[int | str], list[str | np.dtype]]:
        self._path_or_buf.seek(seek_vartypes)
        typlist = []
        dtyplist = []
        for _ in range(self._nvar):
            typ = self._read_uint16()
            if typ <= 2045:
                typlist.append(typ)
                dtyplist.append(str(typ))
            else:
                try:
                    typlist.append(self.TYPE_MAP_XML[typ])  # type: ignore[arg-type]
                    dtyplist.append(self.DTYPE_MAP_XML[typ])  # type: ignore[arg-type]
                except KeyError as err:
                    raise ValueError(f"cannot convert stata types [{typ}]") from err

        return typlist, dtyplist  # type: ignore[return-value]

    def _get_varlist(self) -> list[str]:
        # 33 in older formats, 129 in formats 118 and 119
        b = 33 if self._format_version < 118 else 129
        return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]

    # Returns the format list
    def _get_fmtlist(self) -> list[str]:
        if self._format_version >= 118:
            b = 57
        elif self._format_version > 113:
            b = 49
        elif self._format_version > 104:
            b = 12
        else:
            b = 7

        return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]

    # Returns the label list
    def _get_lbllist(self) -> list[str]:
        if self._format_version >= 118:
            b = 129
        elif self._format_version > 108:
            b = 33
        else:
            b = 9
        return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]

    def _get_variable_labels(self) -> list[str]:
        if self._format_version >= 118:
            vlblist = [
                self._decode(self._path_or_buf.read(321)) for _ in range(self._nvar)
            ]
        elif self._format_version > 105:
            vlblist = [
                self._decode(self._path_or_buf.read(81)) for _ in range(self._nvar)
            ]
        else:
            vlblist = [
                self._decode(self._path_or_buf.read(32)) for _ in range(self._nvar)
            ]
        return vlblist

    def _get_nobs(self) -> int:
        if self._format_version >= 118:
            return self._read_uint64()
        else:
            return self._read_uint32()

    def _get_data_label(self) -> str:
        if self._format_version >= 118:
            strlen = self._read_uint16()
            return self._decode(self._path_or_buf.read(strlen))
        elif self._format_version == 117:
            strlen = self._read_int8()
            return self._decode(self._path_or_buf.read(strlen))
        elif self._format_version > 105:
            return self._decode(self._path_or_buf.read(81))
        else:
            return self._decode(self._path_or_buf.read(32))

    def _get_time_stamp(self) -> str:
        if self._format_version >= 118:
            strlen = self._read_int8()
            return self._path_or_buf.read(strlen).decode("utf-8")
        elif self._format_version == 117:
            strlen = self._read_int8()
            return self._decode(self._path_or_buf.read(strlen))
        elif self._format_version > 104:
            return self._decode(self._path_or_buf.read(18))
        else:
            raise ValueError()

    def _get_seek_variable_labels(self) -> int:
        if self._format_version == 117:
            self._path_or_buf.read(8)  # <variable_labels>, throw away
            # Stata 117 data files do not follow the described format. This is
            # a work around that uses the previous label, 33 bytes for each
            # variable, 20 for the closing tag and 17 for the opening tag
            return self._seek_value_label_names + (33 * self._nvar) + 20 + 17
        elif self._format_version >= 118:
            return self._read_int64() + 17
        else:
            raise ValueError()

    def _read_old_header(self, first_char: bytes) -> None:
        self._format_version = int(first_char[0])
        if self._format_version not in [104, 105, 108, 111, 113, 114, 115]:
            raise ValueError(_version_error.format(version=self._format_version))
        self._set_encoding()
        self._byteorder = ">" if self._read_int8() == 0x1 else "<"
        self._filetype = self._read_int8()
        self._path_or_buf.read(1)  # unused

        self._nvar = self._read_uint16()
        self._nobs = self._get_nobs()

        self._data_label = self._get_data_label()

        self._time_stamp = self._get_time_stamp()

        # descriptors
        if self._format_version > 108:
            typlist = [int(c) for c in self._path_or_buf.read(self._nvar)]
        else:
            buf = self._path_or_buf.read(self._nvar)
            typlistb = np.frombuffer(buf, dtype=np.uint8)
            typlist = []
            for tp in typlistb:
                if tp in self.OLD_TYPE_MAPPING:
                    typlist.append(self.OLD_TYPE_MAPPING[tp])
                else:
                    typlist.append(tp - 127)  # bytes

        try:
            self._typlist = [self.TYPE_MAP[typ] for typ in typlist]
        except ValueError as err:
            invalid_types = ",".join([str(x) for x in typlist])
            raise ValueError(f"cannot convert stata types [{invalid_types}]") from err
        try:
            self._dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
        except ValueError as err:
            invalid_dtypes = ",".join([str(x) for x in typlist])
            raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err

        if self._format_version > 108:
            self._varlist = [
                self._decode(self._path_or_buf.read(33)) for _ in range(self._nvar)
            ]
        else:
            self._varlist = [
                self._decode(self._path_or_buf.read(9)) for _ in range(self._nvar)
            ]
        self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]

        self._fmtlist = self._get_fmtlist()

        self._lbllist = self._get_lbllist()

        self._variable_labels = self._get_variable_labels()

        # ignore expansion fields (Format 105 and later)
        # When reading, read five bytes; the last four bytes now tell you
        # the size of the next read, which you discard. You then continue
        # like this until you read 5 bytes of zeros.

        if self._format_version > 104:
            while True:
                data_type = self._read_int8()
                if self._format_version > 108:
                    data_len = self._read_int32()
                else:
                    data_len = self._read_int16()
                if data_type == 0:
                    break
                self._path_or_buf.read(data_len)

        # necessary data to continue parsing
        self._data_location = self._path_or_buf.tell()

    def _setup_dtype(self) -> np.dtype:
        """Map between numpy and Stata dtypes"""
        if self._dtype is not None:
            return self._dtype

        dtypes = []  # Convert struct data types to numpy data type
        for i, typ in enumerate(self._typlist):
            if typ in self.NUMPY_TYPE_MAP:
                typ = cast(str, typ)  # only strs in NUMPY_TYPE_MAP
                dtypes.append((f"s{i}", f"{self._byteorder}{self.NUMPY_TYPE_MAP[typ]}"))
            else:
                dtypes.append((f"s{i}", f"S{typ}"))
        self._dtype = np.dtype(dtypes)

        return self._dtype

    def _decode(self, s: bytes) -> str:
        # have bytes not strings, so must decode
        s = s.partition(b"\0")[0]
        try:
            return s.decode(self._encoding)
        except UnicodeDecodeError:
            # GH 25960, fallback to handle incorrect format produced when 117
            # files are converted to 118 files in Stata
            encoding = self._encoding
            msg = f"""
One or more strings in the dta file could not be decoded using {encoding}, and
so the fallback encoding of latin-1 is being used. This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct."""
            warnings.warn(
                msg,
                UnicodeWarning,
                stacklevel=find_stack_level(),
            )
            return s.decode("latin-1")
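
    # Illustrative sketch (not part of pandas): _setup_dtype builds a single
    # numpy structured dtype for one whole row, so for a hypothetical file
    # with a little-endian float64 column and a str4 column the row dtype is
    #
    #     np.dtype([("s0", "<f8"), ("s1", "S4")])
    #
    # which lets np.frombuffer decode many rows in one call in read() below.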

    def _read_value_labels(self) -> None:
        self._ensure_open()
        if self._value_labels_read:
            # Don't read twice
            return
        if self._format_version <= 108:
            # Value labels are not supported in version 108 and earlier.
            self._value_labels_read = True
            self._value_label_dict: dict[str, dict[float, str]] = {}
            return

        if self._format_version >= 117:
            self._path_or_buf.seek(self._seek_value_labels)
        else:
            assert self._dtype is not None
            offset = self._nobs * self._dtype.itemsize
            self._path_or_buf.seek(self._data_location + offset)

        self._value_labels_read = True
        self._value_label_dict = {}

        while True:
            if self._format_version >= 117:
                if self._path_or_buf.read(5) == b"</val":  # <lbl>
                    break  # end of value label table

            slength = self._path_or_buf.read(4)
            if not slength:
                break  # end of value label table (format < 117)
            if self._format_version <= 117:
                labname = self._decode(self._path_or_buf.read(33))
            else:
                labname = self._decode(self._path_or_buf.read(129))
            self._path_or_buf.read(3)  # padding

            n = self._read_uint32()
            txtlen = self._read_uint32()
            off = np.frombuffer(
                self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n
            )
            val = np.frombuffer(
                self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n
            )
            ii = np.argsort(off)
            off = off[ii]
            val = val[ii]
            txt = self._path_or_buf.read(txtlen)
            self._value_label_dict[labname] = {}
            for i in range(n):
                end = off[i + 1] if i < n - 1 else txtlen
                self._value_label_dict[labname][val[i]] = self._decode(
                    txt[off[i] : end]
                )
            if self._format_version >= 117:
                self._path_or_buf.read(6)  # </lbl>
        self._value_labels_read = True

    def _read_strls(self) -> None:
        self._path_or_buf.seek(self._seek_strls)
        # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
        self.GSO = {"0": ""}
        while True:
            if self._path_or_buf.read(3) != b"GSO":
                break

            if self._format_version == 117:
                v_o = self._read_uint64()
            else:
                buf = self._path_or_buf.read(12)
                # Only tested on little endian file on little endian machine.
                v_size = 2 if self._format_version == 118 else 3
                if self._byteorder == "<":
                    buf = buf[0:v_size] + buf[4 : (12 - v_size)]
                else:
                    # This path may not be correct, impossible to test
                    buf = buf[0:v_size] + buf[(4 + v_size) :]
                v_o = struct.unpack("Q", buf)[0]
            typ = self._read_uint8()
            length = self._read_uint32()
            va = self._path_or_buf.read(length)
            if typ == 130:
                decoded_va = va[0:-1].decode(self._encoding)
            else:
                # Stata says typ 129 can be binary, so use str
                decoded_va = str(va)
            # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
            self.GSO[str(v_o)] = decoded_va
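
    # Illustrative sketch (not part of pandas): each strL record in the file
    # is a "GSO" tag followed by a (variable, observation) key, a type byte
    # and a length-prefixed payload. Type 130 payloads are null-terminated
    # text, so a hypothetical record decodes like this:
    #
    #     va = b"hello\x00"         # payload read for typ == 130
    #     va[0:-1].decode("utf-8")   # -> "hello", stored as self.GSO[str(v_o)]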
1650 def __next__(self) -> DataFrame:
1651 self._using_iterator = True
1652 return self.read(nrows=self._chunksize)
1654 def get_chunk(self, size: int | None = None) -> DataFrame:
1655 """
1656 Reads lines from Stata file and returns as dataframe
1658 Parameters
1659 ----------
1660 size : int, defaults to None
1661 Number of lines to read. If None, reads whole file.
1663 Returns
1664 -------
1665 DataFrame
1666 """
1667 if size is None:
1668 size = self._chunksize
1669 return self.read(nrows=size)
1671 @Appender(_read_method_doc)
1672 def read(
1673 self,
1674 nrows: int | None = None,
1675 convert_dates: bool | None = None,
1676 convert_categoricals: bool | None = None,
1677 index_col: str | None = None,
1678 convert_missing: bool | None = None,
1679 preserve_dtypes: bool | None = None,
1680 columns: Sequence[str] | None = None,
1681 order_categoricals: bool | None = None,
1682 ) -> DataFrame:
1683 self._ensure_open()
1685 # Handle options
1686 if convert_dates is None:
1687 convert_dates = self._convert_dates
1688 if convert_categoricals is None:
1689 convert_categoricals = self._convert_categoricals
1690 if convert_missing is None:
1691 convert_missing = self._convert_missing
1692 if preserve_dtypes is None:
1693 preserve_dtypes = self._preserve_dtypes
1694 if columns is None:
1695 columns = self._columns
1696 if order_categoricals is None:
1697 order_categoricals = self._order_categoricals
1698 if index_col is None:
1699 index_col = self._index_col
1700 if nrows is None:
1701 nrows = self._nobs
1703 # Handle empty file or chunk. If reading incrementally raise
1704 # StopIteration. If reading the whole thing return an empty
1705 # data frame.
1706 if (self._nobs == 0) and nrows == 0:
1707 self._can_read_value_labels = True
1708 self._data_read = True
1709 data = DataFrame(columns=self._varlist)
1710 # Apply dtypes correctly
1711 for i, col in enumerate(data.columns):
1712 dt = self._dtyplist[i]
1713 if isinstance(dt, np.dtype):
1714 if dt.char != "S":
1715 data[col] = data[col].astype(dt)
1716 if columns is not None:
1717 data = self._do_select_columns(data, columns)
1718 return data
1720 if (self._format_version >= 117) and (not self._value_labels_read):
1721 self._can_read_value_labels = True
1722 self._read_strls()
1724 # Read data
1725 assert self._dtype is not None
1726 dtype = self._dtype
1727 max_read_len = (self._nobs - self._lines_read) * dtype.itemsize
1728 read_len = nrows * dtype.itemsize
1729 read_len = min(read_len, max_read_len)
1730 if read_len <= 0:
1731 # Iterator has finished, should never be here unless
1732 # we are reading the file incrementally
1733 if convert_categoricals:
1734 self._read_value_labels()
1735 raise StopIteration
1736 offset = self._lines_read * dtype.itemsize
1737 self._path_or_buf.seek(self._data_location + offset)
1738 read_lines = min(nrows, self._nobs - self._lines_read)
1739 raw_data = np.frombuffer(
1740 self._path_or_buf.read(read_len), dtype=dtype, count=read_lines
1741 )
1743 self._lines_read += read_lines
1744 if self._lines_read == self._nobs:
1745 self._can_read_value_labels = True
1746 self._data_read = True
1747 # if necessary, swap the byte order to native here
1748 if self._byteorder != self._native_byteorder:
1749 raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder())
1751 if convert_categoricals:
1752 self._read_value_labels()
1754 if len(raw_data) == 0:
1755 data = DataFrame(columns=self._varlist)
1756 else:
1757 data = DataFrame.from_records(raw_data)
1758 data.columns = Index(self._varlist)
1760 # If index is not specified, use actual row number rather than
1761 # restarting at 0 for each chunk.
1762 if index_col is None:
1763 data.index = RangeIndex(
1764 self._lines_read - read_lines, self._lines_read
1765 ) # set attr instead of set_index to avoid copy
1767 if columns is not None:
1768 data = self._do_select_columns(data, columns)
1770 # Decode strings
1771 for col, typ in zip(data, self._typlist):
1772 if isinstance(typ, int):
1773 data[col] = data[col].apply(self._decode)
1775 data = self._insert_strls(data)
1777 # Convert columns (if needed) to match input type
1778 valid_dtypes = [i for i, dtyp in enumerate(self._dtyplist) if dtyp is not None]
1779 object_type = np.dtype(object)
1780 for idx in valid_dtypes:
1781 dtype = data.iloc[:, idx].dtype
1782 if dtype not in (object_type, self._dtyplist[idx]):
1783 data.isetitem(idx, data.iloc[:, idx].astype(dtype))
1785 data = self._do_convert_missing(data, convert_missing)
1787 if convert_dates:
1788 for i, fmt in enumerate(self._fmtlist):
1789 if any(fmt.startswith(date_fmt) for date_fmt in _date_formats):
1790 data.isetitem(
1791 i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt)
1792 )
1794 if convert_categoricals and self._format_version > 108:
1795 data = self._do_convert_categoricals(
1796 data, self._value_label_dict, self._lbllist, order_categoricals
1797 )
1799 if not preserve_dtypes:
1800 retyped_data = []
1801 convert = False
1802 for col in data:
1803 dtype = data[col].dtype
1804 if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
1805 dtype = np.dtype(np.float64)
1806 convert = True
1807 elif dtype in (
1808 np.dtype(np.int8),
1809 np.dtype(np.int16),
1810 np.dtype(np.int32),
1811 ):
1812 dtype = np.dtype(np.int64)
1813 convert = True
1814 retyped_data.append((col, data[col].astype(dtype)))
1815 if convert:
1816 data = DataFrame.from_dict(dict(retyped_data))
1818 if index_col is not None:
1819 data = data.set_index(data.pop(index_col))
1821 return data
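# Illustrative use of the incremental path above (hypothetical file name;
# process() is a stand-in): chunked reads keep a running RangeIndex rather
# than restarting at 0 for each chunk, and raise StopIteration once all
# observations have been consumed.
#
# >>> with pd.read_stata("big.dta", chunksize=50_000) as reader:  # doctest: +SKIP
# ...     for chunk in reader:  # doctest: +SKIP
# ...         process(chunk)  # index continues across chunks  # doctest: +SKIP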
1823 def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
1824 # Check for missing values, and replace if found
1825 replacements = {}
1826 for i in range(len(data.columns)):
1827 fmt = self._typlist[i]
1828 if fmt not in self.VALID_RANGE:
1829 continue
1831 fmt = cast(str, fmt) # only strs in VALID_RANGE
1832 nmin, nmax = self.VALID_RANGE[fmt]
1833 series = data.iloc[:, i]
1835 # appreciably faster to do this with ndarray instead of Series
1836 svals = series._values
1837 missing = (svals < nmin) | (svals > nmax)
1839 if not missing.any():
1840 continue
1842 if convert_missing: # Replacement follows Stata notation
1843 missing_loc = np.nonzero(np.asarray(missing))[0]
1844 umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
1845 replacement = Series(series, dtype=object)
1846 for j, um in enumerate(umissing):
1847 missing_value = StataMissingValue(um)
1849 loc = missing_loc[umissing_loc == j]
1850 replacement.iloc[loc] = missing_value
1851 else: # All replacements are identical
1852 dtype = series.dtype
1853 if dtype not in (np.float32, np.float64):
1854 dtype = np.float64
1855 replacement = Series(series, dtype=dtype)
1856 if not replacement._values.flags["WRITEABLE"]:
1857 # only relevant for ArrayManager; construction
1858 # path for BlockManager ensures writeability
1859 replacement = replacement.copy()
1860 # Note: operating on ._values is much faster than setting values
1861 # through the Series directly. TODO: can we fix that?
1862 replacement._values[missing] = np.nan
1863 replacements[i] = replacement
1864 if replacements:
1865 for idx, value in replacements.items():
1866 data.isetitem(idx, value)
1867 return data
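# Illustrative effect of the two branches above (hypothetical file): with
# convert_missing=False, out-of-range Stata missing codes become NaN in a
# float column; with convert_missing=True the column is upcast to object
# and each missing cell holds a StataMissingValue (rendered ".", ".a", ...,
# ".z" in Stata notation).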
1869 def _insert_strls(self, data: DataFrame) -> DataFrame:
1870 if not hasattr(self, "GSO") or len(self.GSO) == 0:
1871 return data
1872 for i, typ in enumerate(self._typlist):
1873 if typ != "Q":
1874 continue
1875 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1876 data.isetitem(i, [self.GSO[str(k)] for k in data.iloc[:, i]])
1877 return data
1879 def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame:
1880 if not self._column_selector_set:
1881 column_set = set(columns)
1882 if len(column_set) != len(columns):
1883 raise ValueError("columns contains duplicate entries")
1884 unmatched = column_set.difference(data.columns)
1885 if unmatched:
1886 joined = ", ".join(list(unmatched))
1887 raise ValueError(
1888 "The following columns were not "
1889 f"found in the Stata data set: {joined}"
1890 )
1891 # Copy information for retained columns for later processing
1892 dtyplist = []
1893 typlist = []
1894 fmtlist = []
1895 lbllist = []
1896 for col in columns:
1897 i = data.columns.get_loc(col)
1898 dtyplist.append(self._dtyplist[i])
1899 typlist.append(self._typlist[i])
1900 fmtlist.append(self._fmtlist[i])
1901 lbllist.append(self._lbllist[i])
1903 self._dtyplist = dtyplist
1904 self._typlist = typlist
1905 self._fmtlist = fmtlist
1906 self._lbllist = lbllist
1907 self._column_selector_set = True
1909 return data[columns]
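# Illustrative (hypothetical file): the subset is returned in the requested
# order, since both the metadata lists above and the frame itself are
# rebuilt in `columns` order.
#
# >>> pd.read_stata("wide.dta", columns=["y", "x"]).columns  # doctest: +SKIP
# Index(['y', 'x'], dtype='object')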
1911 def _do_convert_categoricals(
1912 self,
1913 data: DataFrame,
1914 value_label_dict: dict[str, dict[float, str]],
1915 lbllist: Sequence[str],
1916 order_categoricals: bool,
1917 ) -> DataFrame:
1918 """
1919 Converts value-labelled columns to pandas Categorical dtype.
1920 """
1921 if not value_label_dict:
1922 return data
1923 cat_converted_data = []
1924 for col, label in zip(data, lbllist):
1925 if label in value_label_dict:
1926 # Explicit Categorical construction with ordered=order_categoricals
1927 vl = value_label_dict[label]
1928 keys = np.array(list(vl.keys()))
1929 column = data[col]
1930 key_matches = column.isin(keys)
1931 if self._using_iterator and key_matches.all():
1932 initial_categories: np.ndarray | None = keys
1933 # If all categories are in the keys and we are iterating,
1934 # use the same keys for all chunks. If some are missing
1935 # value labels, then we will fall back to the categories
1936 # varying across chunks.
1937 else:
1938 if self._using_iterator:
1939 # warn if using an iterator
1940 warnings.warn(
1941 categorical_conversion_warning,
1942 CategoricalConversionWarning,
1943 stacklevel=find_stack_level(),
1944 )
1945 initial_categories = None
1946 cat_data = Categorical(
1947 column, categories=initial_categories, ordered=order_categoricals
1948 )
1949 if initial_categories is None:
1950 # If None here, then we need to match the cats in the Categorical
1951 categories = []
1952 for category in cat_data.categories:
1953 if category in vl:
1954 categories.append(vl[category])
1955 else:
1956 categories.append(category)
1957 else:
1958 # If all cats are matched, we can use the values
1959 categories = list(vl.values())
1960 try:
1961 # Try to catch duplicate categories
1962 # TODO: if we get a non-copying rename_categories, use that
1963 cat_data = cat_data.rename_categories(categories)
1964 except ValueError as err:
1965 vc = Series(categories, copy=False).value_counts()
1966 repeated_cats = list(vc.index[vc > 1])
1967 repeats = "-" * 80 + "\n" + "\n".join(repeated_cats)
1968 # GH 25772
1969 msg = f"""
1970Value labels for column {col} are not unique. These cannot be converted to
1971pandas categoricals.
1973Either read the file with `convert_categoricals` set to False or use the
1974low level interface in `StataReader` to separately read the values and the
1975value_labels.
1977The repeated labels are:
1978{repeats}
1979"""
1980 raise ValueError(msg) from err
1981 # TODO: is the next line needed above in the data(...) method?
1982 cat_series = Series(cat_data, index=data.index, copy=False)
1983 cat_converted_data.append((col, cat_series))
1984 else:
1985 cat_converted_data.append((col, data[col]))
1986 data = DataFrame(dict(cat_converted_data), copy=False)
1987 return data
1989 @property
1990 def data_label(self) -> str:
1991 """
1992 Return data label of Stata file.
1994 Examples
1995 --------
1996 >>> df = pd.DataFrame([(1,)], columns=["variable"])
1997 >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
1998 >>> data_label = "This is a data file."
1999 >>> path = "/My_path/filename.dta"
2000 >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP
2001 ... data_label=data_label, # doctest: +SKIP
2002 ... version=None) # doctest: +SKIP
2003 >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP
2004 ... print(reader.data_label) # doctest: +SKIP
2005 This is a data file.
2006 """
2007 self._ensure_open()
2008 return self._data_label
2010 @property
2011 def time_stamp(self) -> str:
2012 """
2013 Return time stamp of Stata file.
2014 """
2015 self._ensure_open()
2016 return self._time_stamp
2018 def variable_labels(self) -> dict[str, str]:
2019 """
2020 Return a dict associating each variable name with corresponding label.
2022 Returns
2023 -------
2024 dict
2026 Examples
2027 --------
2028 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"])
2029 >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
2030 >>> path = "/My_path/filename.dta"
2031 >>> variable_labels = {"col_1": "This is an example"}
2032 >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP
2033 ... variable_labels=variable_labels, version=None) # doctest: +SKIP
2034 >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP
2035 ... print(reader.variable_labels()) # doctest: +SKIP
2036 {'index': '', 'col_1': 'This is an example', 'col_2': ''}
2037 >>> pd.read_stata(path) # doctest: +SKIP
2038 index col_1 col_2
2039 0 0 1 2
2040 1 1 3 4
2041 """
2042 self._ensure_open()
2043 return dict(zip(self._varlist, self._variable_labels))
2045 def value_labels(self) -> dict[str, dict[float, str]]:
2046 """
2047 Return a nested dict associating each variable name to its value and label.
2049 Returns
2050 -------
2051 dict
2053 Examples
2054 --------
2055 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"])
2056 >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
2057 >>> path = "/My_path/filename.dta"
2058 >>> value_labels = {"col_1": {3: "x"}}
2059 >>> df.to_stata(path, time_stamp=time_stamp, # doctest: +SKIP
2060 ... value_labels=value_labels, version=None) # doctest: +SKIP
2061 >>> with pd.io.stata.StataReader(path) as reader: # doctest: +SKIP
2062 ... print(reader.value_labels()) # doctest: +SKIP
2063 {'col_1': {3: 'x'}}
2064 >>> pd.read_stata(path) # doctest: +SKIP
2065 index col_1 col_2
2066 0 0 1 2
2067 1 1 x 4
2068 """
2069 if not self._value_labels_read:
2070 self._read_value_labels()
2072 return self._value_label_dict
2075@Appender(_read_stata_doc)
2076def read_stata(
2077 filepath_or_buffer: FilePath | ReadBuffer[bytes],
2078 *,
2079 convert_dates: bool = True,
2080 convert_categoricals: bool = True,
2081 index_col: str | None = None,
2082 convert_missing: bool = False,
2083 preserve_dtypes: bool = True,
2084 columns: Sequence[str] | None = None,
2085 order_categoricals: bool = True,
2086 chunksize: int | None = None,
2087 iterator: bool = False,
2088 compression: CompressionOptions = "infer",
2089 storage_options: StorageOptions | None = None,
2090) -> DataFrame | StataReader:
2091 reader = StataReader(
2092 filepath_or_buffer,
2093 convert_dates=convert_dates,
2094 convert_categoricals=convert_categoricals,
2095 index_col=index_col,
2096 convert_missing=convert_missing,
2097 preserve_dtypes=preserve_dtypes,
2098 columns=columns,
2099 order_categoricals=order_categoricals,
2100 chunksize=chunksize,
2101 storage_options=storage_options,
2102 compression=compression,
2103 )
2105 if iterator or chunksize:
2106 return reader
2108 with reader:
2109 return reader.read()
2112def _set_endianness(endianness: str) -> str:
2113 if endianness.lower() in ["<", "little"]:
2114 return "<"
2115 elif endianness.lower() in [">", "big"]:
2116 return ">"
2117 else: # pragma : no cover
2118 raise ValueError(f"Endianness {endianness} not understood")
2121def _pad_bytes(name: AnyStr, length: int) -> AnyStr:
2122 """
2123 Take a str or bytes string and pad it with null bytes until it is `length` chars long.
2124 """
2125 if isinstance(name, bytes):
2126 return name + b"\x00" * (length - len(name))
2127 return name + "\x00" * (length - len(name))
2130def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
2131 """
2132 Convert from one of the stata date formats to a type in TYPE_MAP.
2133 """
2134 if fmt in [
2135 "tc",
2136 "%tc",
2137 "td",
2138 "%td",
2139 "tw",
2140 "%tw",
2141 "tm",
2142 "%tm",
2143 "tq",
2144 "%tq",
2145 "th",
2146 "%th",
2147 "ty",
2148 "%ty",
2149 ]:
2150 return np.dtype(np.float64) # Stata expects doubles for SIFs
2151 else:
2152 raise NotImplementedError(f"Format {fmt} not implemented")
2155def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict:
2156 new_dict = {}
2157 for key in convert_dates:
2158 if not convert_dates[key].startswith("%"): # make sure proper fmts
2159 convert_dates[key] = "%" + convert_dates[key]
2160 if key in varlist:
2161 new_dict.update({varlist.index(key): convert_dates[key]})
2162 else:
2163 if not isinstance(key, int):
2164 raise ValueError("convert_dates key must be a column or an integer")
2165 new_dict.update({key: convert_dates[key]})
2166 return new_dict
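# Illustrative (hypothetical names): with varlist ["id", "when"],
# {"when": "td"} is normalized to {1: "%td"} -- a "%" prefix is added when
# missing and column names are replaced by their positional index.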
2169def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int:
2170 """
2171 Convert numpy dtypes to Stata type codes (string length or numeric type byte).
2172 See TYPE_MAP and comments for an explanation. This is also explained in
2173 the dta spec.
2174 1 - 244 are strings of this length
2175 Pandas Stata
2176 251 - for int8 byte
2177 252 - for int16 int
2178 253 - for int32 long
2179 254 - for float32 float
2180 255 - for double double
2182 If there are dates to convert, then dtype will already have the correct
2183 type inserted.
2184 """
2185 # TODO: expand to handle datetime to integer conversion
2186 if dtype.type is np.object_: # try to coerce it to the biggest string
2187 # not memory efficient, what else could we
2188 # do?
2189 itemsize = max_len_string_array(ensure_object(column._values))
2190 return max(itemsize, 1)
2191 elif dtype.type is np.float64:
2192 return 255
2193 elif dtype.type is np.float32:
2194 return 254
2195 elif dtype.type is np.int32:
2196 return 253
2197 elif dtype.type is np.int16:
2198 return 252
2199 elif dtype.type is np.int8:
2200 return 251
2201 else: # pragma : no cover
2202 raise NotImplementedError(f"Data type {dtype} not supported.")
2205def _dtype_to_default_stata_fmt(
2206 dtype, column: Series, dta_version: int = 114, force_strl: bool = False
2207) -> str:
2208 """
2209 Map numpy dtype to stata's default format for this type. Not terribly
2210 important since users can change this in Stata. Semantics are
2212 object -> "%DDs" where DD is the length of the string. If not a string,
2213 raise ValueError
2214 float64 -> "%10.0g"
2215 float32 -> "%9.0g"
2216 int64 -> "%9.0g"
2217 int32 -> "%12.0g"
2218 int16 -> "%8.0g"
2219 int8 -> "%8.0g"
2220 strl -> "%9s"
2221 """
2222 # TODO: Refactor to combine type with format
2223 # TODO: expand this to handle a default datetime format?
2224 if dta_version < 117:
2225 max_str_len = 244
2226 else:
2227 max_str_len = 2045
2228 if force_strl:
2229 return "%9s"
2230 if dtype.type is np.object_:
2231 itemsize = max_len_string_array(ensure_object(column._values))
2232 if itemsize > max_str_len:
2233 if dta_version >= 117:
2234 return "%9s"
2235 else:
2236 raise ValueError(excessive_string_length_error.format(column.name))
2237 return "%" + str(max(itemsize, 1)) + "s"
2238 elif dtype == np.float64:
2239 return "%10.0g"
2240 elif dtype == np.float32:
2241 return "%9.0g"
2242 elif dtype == np.int32:
2243 return "%12.0g"
2244 elif dtype in (np.int8, np.int16):
2245 return "%8.0g"
2246 else: # pragma : no cover
2247 raise NotImplementedError(f"Data type {dtype} not supported.")
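# Illustrative: an object column whose longest string is 12 characters maps
# to "%12s"; the same column forced to strl on dta 117+ maps to "%9s".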
2250@doc(
2251 storage_options=_shared_docs["storage_options"],
2252 compression_options=_shared_docs["compression_options"] % "fname",
2253)
2254class StataWriter(StataParser):
2255 """
2256 A class for writing Stata binary dta files
2258 Parameters
2259 ----------
2260 fname : path (string), buffer or path object
2261 string, path object (pathlib.Path or py._path.local.LocalPath) or
2262 object implementing a binary write() functions. If using a buffer
2263 then the buffer will not be automatically closed after the file
2264 is written.
2265 data : DataFrame
2266 Input to save
2267 convert_dates : dict
2268 Dictionary mapping columns containing datetime types to stata internal
2269 format to use when writing the dates. Options are 'tc', 'td', 'tm',
2270 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
2271 Datetime columns that do not have a conversion type specified will be
2272 converted to 'tc'. Raises NotImplementedError if a datetime column has
2273 timezone information
2274 write_index : bool
2275 Write the index to Stata dataset.
2276 byteorder : str
2277 Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
2278 time_stamp : datetime
2279 A datetime to use as the file creation date. Default is the current time.
2280 data_label : str
2281 A label for the data set. Must be 80 characters or smaller.
2282 variable_labels : dict
2283 Dictionary containing columns as keys and variable labels as values.
2284 Each label must be 80 characters or smaller.
2285 {compression_options}
2287 .. versionchanged:: 1.4.0 Zstandard support.
2289 {storage_options}
2291 value_labels : dict of dicts
2292 Dictionary containing columns as keys and dictionaries of column value
2293 to labels as values. The combined length of all labels for a single
2294 variable must be 32,000 characters or smaller.
2296 .. versionadded:: 1.4.0
2298 Returns
2299 -------
2300 writer : StataWriter instance
2301 The StataWriter instance has a write_file method, which will
2302 write the file to the given `fname`.
2304 Raises
2305 ------
2306 NotImplementedError
2307 * If datetimes contain timezone information
2308 ValueError
2309 * Columns listed in convert_dates are neither datetime64[ns]
2310 nor datetime
2311 * Column dtype is not representable in Stata
2312 * Column listed in convert_dates is not in DataFrame
2313 * Categorical label contains more than 32,000 characters
2315 Examples
2316 --------
2317 >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
2318 >>> writer = StataWriter('./data_file.dta', data)
2319 >>> writer.write_file()
2321 Directly write a zip file
2322 >>> compression = {{"method": "zip", "archive_name": "data_file.dta"}}
2323 >>> writer = StataWriter('./data_file.zip', data, compression=compression)
2324 >>> writer.write_file()
2326 Save a DataFrame with dates
2327 >>> from datetime import datetime
2328 >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
2329 >>> writer = StataWriter('./date_data_file.dta', data, {{'date' : 'tw'}})
2330 >>> writer.write_file()
2331 """
2333 _max_string_length = 244
2334 _encoding: Literal["latin-1", "utf-8"] = "latin-1"
2336 def __init__(
2337 self,
2338 fname: FilePath | WriteBuffer[bytes],
2339 data: DataFrame,
2340 convert_dates: dict[Hashable, str] | None = None,
2341 write_index: bool = True,
2342 byteorder: str | None = None,
2343 time_stamp: datetime | None = None,
2344 data_label: str | None = None,
2345 variable_labels: dict[Hashable, str] | None = None,
2346 compression: CompressionOptions = "infer",
2347 storage_options: StorageOptions | None = None,
2348 *,
2349 value_labels: dict[Hashable, dict[float, str]] | None = None,
2350 ) -> None:
2351 super().__init__()
2352 self.data = data
2353 self._convert_dates = {} if convert_dates is None else convert_dates
2354 self._write_index = write_index
2355 self._time_stamp = time_stamp
2356 self._data_label = data_label
2357 self._variable_labels = variable_labels
2358 self._non_cat_value_labels = value_labels
2359 self._value_labels: list[StataValueLabel] = []
2360 self._has_value_labels = np.array([], dtype=bool)
2361 self._compression = compression
2362 self._output_file: IO[bytes] | None = None
2363 self._converted_names: dict[Hashable, str] = {}
2364 # attach nobs, nvars, data, varlist, typlist
2365 self._prepare_pandas(data)
2366 self.storage_options = storage_options
2368 if byteorder is None:
2369 byteorder = sys.byteorder
2370 self._byteorder = _set_endianness(byteorder)
2371 self._fname = fname
2372 self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
2374 def _write(self, to_write: str) -> None:
2375 """
2376 Helper to call encode before writing to file for Python 3 compat.
2377 """
2378 self.handles.handle.write(to_write.encode(self._encoding))
2380 def _write_bytes(self, value: bytes) -> None:
2381 """
2382 Helper to assert file is open before writing.
2383 """
2384 self.handles.handle.write(value)
2386 def _prepare_non_cat_value_labels(
2387 self, data: DataFrame
2388 ) -> list[StataNonCatValueLabel]:
2389 """
2390 Check for value labels provided for non-categorical columns and build
2391 the corresponding StataNonCatValueLabel objects.
2392 """
2393 non_cat_value_labels: list[StataNonCatValueLabel] = []
2394 if self._non_cat_value_labels is None:
2395 return non_cat_value_labels
2397 for labname, labels in self._non_cat_value_labels.items():
2398 if labname in self._converted_names:
2399 colname = self._converted_names[labname]
2400 elif labname in data.columns:
2401 colname = str(labname)
2402 else:
2403 raise KeyError(
2404 f"Can't create value labels for {labname}, it wasn't "
2405 "found in the dataset."
2406 )
2408 if not is_numeric_dtype(data[colname].dtype):
2409 # Labels should not be passed explicitly for categorical
2410 # columns that will be converted to int
2411 raise ValueError(
2412 f"Can't create value labels for {labname}, value labels "
2413 "can only be applied to numeric columns."
2414 )
2415 svl = StataNonCatValueLabel(colname, labels, self._encoding)
2416 non_cat_value_labels.append(svl)
2417 return non_cat_value_labels
2419 def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
2420 """
2421 Check for categorical columns, retain categorical information for
2422 Stata file and convert categorical data to int
2423 """
2424 is_cat = [isinstance(dtype, CategoricalDtype) for dtype in data.dtypes]
2425 if not any(is_cat):
2426 return data
2428 self._has_value_labels |= np.array(is_cat)
2430 get_base_missing_value = StataMissingValue.get_base_missing_value
2431 data_formatted = []
2432 for col, col_is_cat in zip(data, is_cat):
2433 if col_is_cat:
2434 svl = StataValueLabel(data[col], encoding=self._encoding)
2435 self._value_labels.append(svl)
2436 dtype = data[col].cat.codes.dtype
2437 if dtype == np.int64:
2438 raise ValueError(
2439 "It is not possible to export "
2440 "int64-based categorical data to Stata."
2441 )
2442 values = data[col].cat.codes._values.copy()
2444 # Upcast if needed so that correct missing values can be set
2445 if values.max() >= get_base_missing_value(dtype):
2446 if dtype == np.int8:
2447 dtype = np.dtype(np.int16)
2448 elif dtype == np.int16:
2449 dtype = np.dtype(np.int32)
2450 else:
2451 dtype = np.dtype(np.float64)
2452 values = np.array(values, dtype=dtype)
2454 # Replace missing values with Stata missing value for type
2455 values[values == -1] = get_base_missing_value(dtype)
2456 data_formatted.append((col, values))
2457 else:
2458 data_formatted.append((col, data[col]))
2459 return DataFrame.from_dict(dict(data_formatted))
2461 def _replace_nans(self, data: DataFrame) -> DataFrame:
2463 """
2464 Checks floating point data columns for nans, and replaces these with
2465 the generic Stata missing value (.)
2466 """
2467 for c in data:
2468 dtype = data[c].dtype
2469 if dtype in (np.float32, np.float64):
2470 if dtype == np.float32:
2471 replacement = self.MISSING_VALUES["f"]
2472 else:
2473 replacement = self.MISSING_VALUES["d"]
2474 data[c] = data[c].fillna(replacement)
2476 return data
2478 def _update_strl_names(self) -> None:
2479 """No-op, forward compatibility"""
2481 def _validate_variable_name(self, name: str) -> str:
2482 """
2483 Validate variable names for Stata export.
2485 Parameters
2486 ----------
2487 name : str
2488 Variable name
2490 Returns
2491 -------
2492 str
2493 The validated name with invalid characters replaced with
2494 underscores.
2496 Notes
2497 -----
2498 Stata 114 and 117 support ASCII characters in a-z, A-Z, 0-9
2499 and _.
2500 """
2501 for c in name:
2502 if (
2503 (c < "A" or c > "Z")
2504 and (c < "a" or c > "z")
2505 and (c < "0" or c > "9")
2506 and c != "_"
2507 ):
2508 name = name.replace(c, "_")
2509 return name
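# Illustrative: invalid characters are replaced one at a time, so a name
# like "gdp.growth (%)" is exported as "gdp_growth____".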
2511 def _check_column_names(self, data: DataFrame) -> DataFrame:
2512 """
2513 Checks column names to ensure that they are valid Stata column names.
2514 This includes checks for:
2515 * Non-string names
2516 * Stata keywords
2517 * Variables that start with numbers
2518 * Variables with names that are too long
2520 When an illegal variable name is detected, it is converted, and if
2521 dates are exported, the variable name is propagated to the date
2522 conversion dictionary
2523 """
2524 converted_names: dict[Hashable, str] = {}
2525 columns = list(data.columns)
2526 original_columns = columns[:]
2528 duplicate_var_id = 0
2529 for j, name in enumerate(columns):
2530 orig_name = name
2531 if not isinstance(name, str):
2532 name = str(name)
2534 name = self._validate_variable_name(name)
2536 # Variable name must not be a reserved word
2537 if name in self.RESERVED_WORDS:
2538 name = "_" + name
2540 # Variable name may not start with a number
2541 if "0" <= name[0] <= "9":
2542 name = "_" + name
2544 name = name[: min(len(name), 32)]
2546 if not name == orig_name:
2547 # check for duplicates
2548 while columns.count(name) > 0:
2549 # prepend ascending number to avoid duplicates
2550 name = "_" + str(duplicate_var_id) + name
2551 name = name[: min(len(name), 32)]
2552 duplicate_var_id += 1
2553 converted_names[orig_name] = name
2555 columns[j] = name
2557 data.columns = Index(columns)
2559 # Check date conversion, and fix key if needed
2560 if self._convert_dates:
2561 for c, o in zip(columns, original_columns):
2562 if c != o:
2563 self._convert_dates[c] = self._convert_dates[o]
2564 del self._convert_dates[o]
2566 if converted_names:
2567 conversion_warning = []
2568 for orig_name, name in converted_names.items():
2569 msg = f"{orig_name} -> {name}"
2570 conversion_warning.append(msg)
2572 ws = invalid_name_doc.format("\n ".join(conversion_warning))
2573 warnings.warn(
2574 ws,
2575 InvalidColumnName,
2576 stacklevel=find_stack_level(),
2577 )
2579 self._converted_names = converted_names
2580 self._update_strl_names()
2582 return data
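# Illustrative renames produced above (hypothetical frame): a non-string
# column name 1 becomes "_1", the reserved word "if" becomes "_if", and all
# renames are reported together in a single InvalidColumnName warning.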
2584 def _set_formats_and_types(self, dtypes: Series) -> None:
2585 self.fmtlist: list[str] = []
2586 self.typlist: list[int] = []
2587 for col, dtype in dtypes.items():
2588 self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col]))
2589 self.typlist.append(_dtype_to_stata_type(dtype, self.data[col]))
2591 def _prepare_pandas(self, data: DataFrame) -> None:
2592 # NOTE: we might need a different API / class for pandas objects so
2593 # we can set different semantics - handle this with a PR to pandas.io
2595 data = data.copy()
2597 if self._write_index:
2598 temp = data.reset_index()
2599 if isinstance(temp, DataFrame):
2600 data = temp
2602 # Ensure column names are strings
2603 data = self._check_column_names(data)
2605 # Check columns for compatibility with stata, upcast if necessary
2606 # Raise if outside the supported range
2607 data = _cast_to_stata_types(data)
2609 # Replace NaNs with Stata missing values
2610 data = self._replace_nans(data)
2612 # Set all columns to initially unlabelled
2613 self._has_value_labels = np.repeat(False, data.shape[1])
2615 # Create value labels for non-categorical data
2616 non_cat_value_labels = self._prepare_non_cat_value_labels(data)
2618 non_cat_columns = [svl.labname for svl in non_cat_value_labels]
2619 has_non_cat_val_labels = data.columns.isin(non_cat_columns)
2620 self._has_value_labels |= has_non_cat_val_labels
2621 self._value_labels.extend(non_cat_value_labels)
2623 # Convert categoricals to int data, and strip labels
2624 data = self._prepare_categoricals(data)
2626 self.nobs, self.nvar = data.shape
2627 self.data = data
2628 self.varlist = data.columns.tolist()
2630 dtypes = data.dtypes
2632 # Ensure all date columns are converted
2633 for col in data:
2634 if col in self._convert_dates:
2635 continue
2636 if lib.is_np_dtype(data[col].dtype, "M"):
2637 self._convert_dates[col] = "tc"
2639 self._convert_dates = _maybe_convert_to_int_keys(
2640 self._convert_dates, self.varlist
2641 )
2642 for key in self._convert_dates:
2643 new_type = _convert_datetime_to_stata_type(self._convert_dates[key])
2644 dtypes.iloc[key] = np.dtype(new_type)
2646 # Verify object arrays are strings and encode to bytes
2647 self._encode_strings()
2649 self._set_formats_and_types(dtypes)
2651 # set the given format for the datetime cols
2652 if self._convert_dates is not None:
2653 for key in self._convert_dates:
2654 if isinstance(key, int):
2655 self.fmtlist[key] = self._convert_dates[key]
2657 def _encode_strings(self) -> None:
2658 """
2659 Encode strings in dta-specific encoding
2661 Do not encode columns marked for date conversion or for strL
2662 conversion. The strL converter independently handles conversion and
2663 also accepts empty string arrays.
2664 """
2665 convert_dates = self._convert_dates
2666 # _convert_strl is not available in dta 114
2667 convert_strl = getattr(self, "_convert_strl", [])
2668 for i, col in enumerate(self.data):
2669 # Skip columns marked for date conversion or strl conversion
2670 if i in convert_dates or col in convert_strl:
2671 continue
2672 column = self.data[col]
2673 dtype = column.dtype
2674 if dtype.type is np.object_:
2675 inferred_dtype = infer_dtype(column, skipna=True)
2676 if not ((inferred_dtype == "string") or len(column) == 0):
2677 col = column.name
2678 raise ValueError(
2679 f"""\
2680Column `{col}` cannot be exported.\n\nOnly string-like object arrays
2681containing all strings or a mix of strings and None can be exported.
2682Object arrays containing only null values are prohibited. Other object
2683types cannot be exported and must first be converted to one of the
2684supported types."""
2685 )
2686 encoded = self.data[col].str.encode(self._encoding)
2687 # If larger than _max_string_length do nothing
2688 if (
2689 max_len_string_array(ensure_object(encoded._values))
2690 <= self._max_string_length
2691 ):
2692 self.data[col] = encoded
2694 def write_file(self) -> None:
2695 """
2696 Export DataFrame object to Stata dta format.
2698 Examples
2699 --------
2700 >>> df = pd.DataFrame({"fully_labelled": [1, 2, 3, 3, 1],
2701 ... "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan],
2702 ... "Y": [7, 7, 9, 8, 10],
2703 ... "Z": pd.Categorical(["j", "k", "l", "k", "j"]),
2704 ... })
2705 >>> path = "/My_path/filename.dta"
2706 >>> labels = {"fully_labelled": {1: "one", 2: "two", 3: "three"},
2707 ... "partially_labelled": {1.0: "one", 2.0: "two"},
2708 ... }
2709 >>> writer = pd.io.stata.StataWriter(path,
2710 ... df,
2711 ... value_labels=labels) # doctest: +SKIP
2712 >>> writer.write_file() # doctest: +SKIP
2713 >>> df = pd.read_stata(path) # doctest: +SKIP
2714 >>> df # doctest: +SKIP
2715 index fully_labelled partially_labelled Y Z
2716 0 0 one one 7 j
2717 1 1 two two 7 k
2718 2 2 three NaN 9 l
2719 3 3 three 9.0 8 k
2720 4 4 one NaN 10 j
2721 """
2722 with get_handle(
2723 self._fname,
2724 "wb",
2725 compression=self._compression,
2726 is_text=False,
2727 storage_options=self.storage_options,
2728 ) as self.handles:
2729 if self.handles.compression["method"] is not None:
2730 # ZipFile creates a file (with the same name) for each write call.
2731 # Write it first into a buffer and then write the buffer to the ZipFile.
2732 self._output_file, self.handles.handle = self.handles.handle, BytesIO()
2733 self.handles.created_handles.append(self.handles.handle)
2735 try:
2736 self._write_header(
2737 data_label=self._data_label, time_stamp=self._time_stamp
2738 )
2739 self._write_map()
2740 self._write_variable_types()
2741 self._write_varnames()
2742 self._write_sortlist()
2743 self._write_formats()
2744 self._write_value_label_names()
2745 self._write_variable_labels()
2746 self._write_expansion_fields()
2747 self._write_characteristics()
2748 records = self._prepare_data()
2749 self._write_data(records)
2750 self._write_strls()
2751 self._write_value_labels()
2752 self._write_file_close_tag()
2753 self._write_map()
2754 self._close()
2755 except Exception as exc:
2756 self.handles.close()
2757 if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile(
2758 self._fname
2759 ):
2760 try:
2761 os.unlink(self._fname)
2762 except OSError:
2763 warnings.warn(
2764 f"This save was not successful but {self._fname} could not "
2765 "be deleted. This file is not valid.",
2766 ResourceWarning,
2767 stacklevel=find_stack_level(),
2768 )
2769 raise exc
2771 def _close(self) -> None:
2772 """
2773 Close the file if it was created by the writer.
2775 If a buffer or file-like object was passed in, for example a GzipFile,
2776 then leave this file open for the caller to close.
2777 """
2778 # flush the buffered output to the actual (possibly compressed) handle
2779 if self._output_file is not None:
2780 assert isinstance(self.handles.handle, BytesIO)
2781 bio, self.handles.handle = self.handles.handle, self._output_file
2782 self.handles.handle.write(bio.getvalue())
2784 def _write_map(self) -> None:
2785 """No-op, future compatibility"""
2787 def _write_file_close_tag(self) -> None:
2788 """No-op, future compatibility"""
2790 def _write_characteristics(self) -> None:
2791 """No-op, future compatibility"""
2793 def _write_strls(self) -> None:
2794 """No-op, future compatibility"""
2796 def _write_expansion_fields(self) -> None:
2797 """Write 5 zeros for expansion fields"""
2798 self._write(_pad_bytes("", 5))
2800 def _write_value_labels(self) -> None:
2801 for vl in self._value_labels:
2802 self._write_bytes(vl.generate_value_label(self._byteorder))
2804 def _write_header(
2805 self,
2806 data_label: str | None = None,
2807 time_stamp: datetime | None = None,
2808 ) -> None:
2809 byteorder = self._byteorder
2810 # ds_format - just use 114
2811 self._write_bytes(struct.pack("b", 114))
2812 # byteorder
2813 self._write("\x01" if byteorder == ">" else "\x02")
2814 # filetype
2815 self._write("\x01")
2816 # unused
2817 self._write("\x00")
2818 # number of vars, 2 bytes
2819 self._write_bytes(struct.pack(byteorder + "h", self.nvar)[:2])
2820 # number of obs, 4 bytes
2821 self._write_bytes(struct.pack(byteorder + "i", self.nobs)[:4])
2822 # data label 81 bytes, char, null terminated
2823 if data_label is None:
2824 self._write_bytes(self._null_terminate_bytes(_pad_bytes("", 80)))
2825 else:
2826 self._write_bytes(
2827 self._null_terminate_bytes(_pad_bytes(data_label[:80], 80))
2828 )
2829 # time stamp, 18 bytes, char, null terminated
2830 # format dd Mon yyyy hh:mm
2831 if time_stamp is None:
2832 time_stamp = datetime.now()
2833 elif not isinstance(time_stamp, datetime):
2834 raise ValueError("time_stamp should be datetime type")
2835 # GH #13856
2836 # Avoid locale-specific month conversion
2837 months = [
2838 "Jan",
2839 "Feb",
2840 "Mar",
2841 "Apr",
2842 "May",
2843 "Jun",
2844 "Jul",
2845 "Aug",
2846 "Sep",
2847 "Oct",
2848 "Nov",
2849 "Dec",
2850 ]
2851 month_lookup = {i + 1: month for i, month in enumerate(months)}
2852 ts = (
2853 time_stamp.strftime("%d ")
2854 + month_lookup[time_stamp.month]
2855 + time_stamp.strftime(" %Y %H:%M")
2856 )
2857 self._write_bytes(self._null_terminate_bytes(ts))
2859 def _write_variable_types(self) -> None:
2860 for typ in self.typlist:
2861 self._write_bytes(struct.pack("B", typ))
2863 def _write_varnames(self) -> None:
2864 # varlist names are checked by _check_column_names
2865 # varlist, requires null terminated
2866 for name in self.varlist:
2867 name = self._null_terminate_str(name)
2868 name = _pad_bytes(name[:32], 33)
2869 self._write(name)
2871 def _write_sortlist(self) -> None:
2872 # srtlist, 2*(nvar+1), int array, encoded by byteorder
2873 srtlist = _pad_bytes("", 2 * (self.nvar + 1))
2874 self._write(srtlist)
2876 def _write_formats(self) -> None:
2877 # fmtlist, 49*nvar, char array
2878 for fmt in self.fmtlist:
2879 self._write(_pad_bytes(fmt, 49))
2881 def _write_value_label_names(self) -> None:
2882 # lbllist, 33*nvar, char array
2883 for i in range(self.nvar):
2884 # Use variable name when categorical
2885 if self._has_value_labels[i]:
2886 name = self.varlist[i]
2887 name = self._null_terminate_str(name)
2888 name = _pad_bytes(name[:32], 33)
2889 self._write(name)
2890 else: # Default is empty label
2891 self._write(_pad_bytes("", 33))
2893 def _write_variable_labels(self) -> None:
2894 # Missing labels are 80 blank characters plus null termination
2895 blank = _pad_bytes("", 81)
2897 if self._variable_labels is None:
2898 for i in range(self.nvar):
2899 self._write(blank)
2900 return
2902 for col in self.data:
2903 if col in self._variable_labels:
2904 label = self._variable_labels[col]
2905 if len(label) > 80:
2906 raise ValueError("Variable labels must be 80 characters or fewer")
2907 is_latin1 = all(ord(c) < 256 for c in label)
2908 if not is_latin1:
2909 raise ValueError(
2910 "Variable labels must contain only characters that "
2911 "can be encoded in Latin-1"
2912 )
2913 self._write(_pad_bytes(label, 81))
2914 else:
2915 self._write(blank)
2917 def _convert_strls(self, data: DataFrame) -> DataFrame:
2918 """No-op, future compatibility"""
2919 return data
2921 def _prepare_data(self) -> np.rec.recarray:
2922 data = self.data
2923 typlist = self.typlist
2924 convert_dates = self._convert_dates
2925 # 1. Convert dates
2926 if self._convert_dates is not None:
2927 for i, col in enumerate(data):
2928 if i in convert_dates:
2929 data[col] = _datetime_to_stata_elapsed_vec(
2930 data[col], self.fmtlist[i]
2931 )
2932 # 2. Convert strls
2933 data = self._convert_strls(data)
2935 # 3. Convert bad string data to '' and pad to correct length
2936 dtypes = {}
2937 native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
2938 for i, col in enumerate(data):
2939 typ = typlist[i]
2940 if typ <= self._max_string_length:
2941 with warnings.catch_warnings():
2942 warnings.filterwarnings(
2943 "ignore",
2944 "Downcasting object dtype arrays",
2945 category=FutureWarning,
2946 )
2947 dc = data[col].fillna("")
2948 data[col] = dc.apply(_pad_bytes, args=(typ,))
2949 stype = f"S{typ}"
2950 dtypes[col] = stype
2951 data[col] = data[col].astype(stype)
2952 else:
2953 dtype = data[col].dtype
2954 if not native_byteorder:
2955 dtype = dtype.newbyteorder(self._byteorder)
2956 dtypes[col] = dtype
2958 return data.to_records(index=False, column_dtypes=dtypes)
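# Illustrative: a string column with Stata type 10 is null-padded to 10
# bytes and stored as fixed-width dtype "S10" in the output recarray, while
# numeric columns only get a byte-order adjustment when needed.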
2960 def _write_data(self, records: np.rec.recarray) -> None:
2961 self._write_bytes(records.tobytes())
2963 @staticmethod
2964 def _null_terminate_str(s: str) -> str:
2965 s += "\x00"
2966 return s
2968 def _null_terminate_bytes(self, s: str) -> bytes:
2969 return self._null_terminate_str(s).encode(self._encoding)
2972def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) -> int:
2973 """
2974 Converts numpy dtypes to Stata 117 type codes (string length or numeric code).
2975 See TYPE_MAP and comments for an explanation. This is also explained in
2976 the dta spec.
2977 1 - 2045 are strings of this length
2978 Pandas Stata
2979 32768 - for object strL
2980 65526 - for float64 double
2981 65527 - for float32 float
2982 65528 - for int32 long
2983 65529 - for int16 int
2984 65530 - for int8 byte
2986 If there are dates to convert, then dtype will already have the correct
2987 type inserted.
2988 """
2989 # TODO: expand to handle datetime to integer conversion
2990 if force_strl:
2991 return 32768
2992 if dtype.type is np.object_: # try to coerce it to the biggest string
2993 # not memory efficient, what else could we
2994 # do?
2995 itemsize = max_len_string_array(ensure_object(column._values))
2996 itemsize = max(itemsize, 1)
2997 if itemsize <= 2045:
2998 return itemsize
2999 return 32768
3000 elif dtype.type is np.float64:
3001 return 65526
3002 elif dtype.type is np.float32:
3003 return 65527
3004 elif dtype.type is np.int32:
3005 return 65528
3006 elif dtype.type is np.int16:
3007 return 65529
3008 elif dtype.type is np.int8:
3009 return 65530
3010 else: # pragma : no cover
3011 raise NotImplementedError(f"Data type {dtype} not supported.")
3014def _pad_bytes_new(name: str | bytes, length: int) -> bytes:
3015 """
3016 Takes a str or bytes instance and pads it with null bytes until it is `length` chars long.
3017 """
3018 if isinstance(name, str):
3019 name = bytes(name, "utf-8")
3020 return name + b"\x00" * (length - len(name))
3023class StataStrLWriter:
3024 """
3025 Converter for Stata StrLs
3027 Stata StrLs map 8-byte values to strings, which are stored using a
3028 dictionary-like format where each string is keyed by two values (v, o).
3030 Parameters
3031 ----------
3032 df : DataFrame
3033 DataFrame to convert
3034 columns : Sequence[str]
3035 List of columns names to convert to StrL
3036 version : int, optional
3037 dta version. Currently supports 117, 118 and 119
3038 byteorder : str, optional
3039 Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
3041 Notes
3042 -----
3043 Supports creation of the StrL block of a dta file for dta versions
3044 117, 118 and 119. These differ in how the GSO is stored. 118 and
3045 119 store the GSO lookup value as a uint32 and a uint64, while 117
3046 uses two uint32s. 118 and 119 also encode all strings as unicode
3047 which is required by the format. 117 uses 'latin-1', a fixed-width
3048 encoding that extends the 7-bit ASCII table with an additional 128
3049 characters.
3050 """
3052 def __init__(
3053 self,
3054 df: DataFrame,
3055 columns: Sequence[str],
3056 version: int = 117,
3057 byteorder: str | None = None,
3058 ) -> None:
3059 if version not in (117, 118, 119):
3060 raise ValueError("Only dta versions 117, 118 and 119 supported")
3061 self._dta_ver = version
3063 self.df = df
3064 self.columns = columns
3065 self._gso_table = {"": (0, 0)}
3066 if byteorder is None:
3067 byteorder = sys.byteorder
3068 self._byteorder = _set_endianness(byteorder)
3070 gso_v_type = "I" # uint32
3071 gso_o_type = "Q" # uint64
3072 self._encoding = "utf-8"
3073 if version == 117:
3074 o_size = 4
3075 gso_o_type = "I" # 117 used uint32
3076 self._encoding = "latin-1"
3077 elif version == 118:
3078 o_size = 6
3079 else: # version == 119
3080 o_size = 5
3081 self._o_offset = 2 ** (8 * (8 - o_size))
3082 self._gso_o_type = gso_o_type
3083 self._gso_v_type = gso_v_type
3085 def _convert_key(self, key: tuple[int, int]) -> int:
3086 v, o = key
3087 return v + self._o_offset * o
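# Illustrative: for dta 118, o_size is 6 and _o_offset is 2 ** 16, so the
# key (v=2, o=3) encodes as 2 + 3 * 2 ** 16 == 196610 -- v occupies the low
# two bytes of the uint64 and o the upper six.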
3089 def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
3090 """
3091 Generates the GSO lookup table for the DataFrame
3093 Returns
3094 -------
3095 gso_table : dict
3096 Ordered dictionary using the string found as keys
3097 and their lookup position (v,o) as values
3098 gso_df : DataFrame
3099 DataFrame where strl columns have been converted to
3100 (v,o) values
3102 Notes
3103 -----
3104 Modifies the DataFrame in-place.
3106 The DataFrame returned encodes the (v,o) values as uint64s. The
3107 encoding depends on the dta version, and can be expressed as
3109 enc = v + o * 2 ** (8 * (8 - o_size))
3111 so that v is stored in the lower bytes and o in the upper bytes.
3112 o_size, the number of bytes reserved for o, is
3114 * 117: 4
3115 * 118: 6
3116 * 119: 5
3117 """
3118 gso_table = self._gso_table
3119 gso_df = self.df
3120 columns = list(gso_df.columns)
3121 selected = gso_df[self.columns]
3122 col_index = [(col, columns.index(col)) for col in self.columns]
3123 keys = np.empty(selected.shape, dtype=np.uint64)
3124 for o, (idx, row) in enumerate(selected.iterrows()):
3125 for j, (col, v) in enumerate(col_index):
3126 val = row[col]
3127 # Allow columns with mixed str and None (GH 23633)
3128 val = "" if val is None else val
3129 key = gso_table.get(val, None)
3130 if key is None:
3131 # Stata uses 1-based ("human") numbering for v and o
3132 key = (v + 1, o + 1)
3133 gso_table[val] = key
3134 keys[o, j] = self._convert_key(key)
3135 for i, col in enumerate(self.columns):
3136 gso_df[col] = keys[:, i]
3138 return gso_table, gso_df
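# Illustrative (assumed data): for a single strl column ["a", "b", "a"] at
# position 0, the table becomes {"": (0, 0), "a": (1, 1), "b": (1, 2)} and
# the column is replaced by the encoded uint64 keys; repeated strings reuse
# their first (v, o) assignment.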
3140 def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes:
3141 """
3142 Generates the binary blob of GSOs that is written to the dta file.
3144 Parameters
3145 ----------
3146 gso_table : dict
3147 Ordered dictionary (str, vo)
3149 Returns
3150 -------
3151 gso : bytes
3152 Binary content of dta file to be placed between strl tags
3154 Notes
3155 -----
3156 Output format depends on dta version. 117 uses two uint32s to
3157 express v and o while 118+ uses a uint32 for v and a uint64 for o.
3158 """
3159 # Format information
3160 # Length includes null term
3161 # 117
3162 # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
3163 # 3 u4 u4 u1 u4 string + null term
3164 #
3165 # 118, 119
3166 # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
3167 # 3 u4 u8 u1 u4 string + null term
3169 bio = BytesIO()
3170 gso = bytes("GSO", "ascii")
3171 gso_type = struct.pack(self._byteorder + "B", 130)
3172 null = struct.pack(self._byteorder + "B", 0)
3173 v_type = self._byteorder + self._gso_v_type
3174 o_type = self._byteorder + self._gso_o_type
3175 len_type = self._byteorder + "I"
3176 for strl, vo in gso_table.items():
3177 if vo == (0, 0):
3178 continue
3179 v, o = vo
3181 # GSO
3182 bio.write(gso)
3184 # vvvv
3185 bio.write(struct.pack(v_type, v))
3187 # oooo / oooooooo
3188 bio.write(struct.pack(o_type, o))
3190 # t
3191 bio.write(gso_type)
3193 # llll
3194 utf8_string = bytes(strl, "utf-8")
3195 bio.write(struct.pack(len_type, len(utf8_string) + 1))
3197 # xxx...xxx
3198 bio.write(utf8_string)
3199 bio.write(null)
3201 return bio.getvalue()
3204class StataWriter117(StataWriter):
3205 """
3206 A class for writing Stata binary dta files in Stata 13 format (117)
3208 Parameters
3209 ----------
3210 fname : path (string), buffer or path object
3211 string, path object (pathlib.Path or py._path.local.LocalPath) or
3212 object implementing a binary write() functions. If using a buffer
3213 then the buffer will not be automatically closed after the file
3214 is written.
3215 data : DataFrame
3216 Input to save
3217 convert_dates : dict
3218 Dictionary mapping columns containing datetime types to stata internal
3219 format to use when writing the dates. Options are 'tc', 'td', 'tm',
3220 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
3221 Datetime columns that do not have a conversion type specified will be
3222 converted to 'tc'. Raises NotImplementedError if a datetime column has
3223 timezone information
3224 write_index : bool
3225 Write the index to Stata dataset.
3226 byteorder : str
3227 Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
3228 time_stamp : datetime
3229 A datetime to use as the file creation date. Default is the current time.
3230 data_label : str
3231 A label for the data set. Must be 80 characters or smaller.
3232 variable_labels : dict
3233 Dictionary containing columns as keys and variable labels as values.
3234 Each label must be 80 characters or smaller.
3235 convert_strl : list
3236 List of columns names to convert to Stata StrL format. Columns with
3237 more than 2045 characters are automatically written as StrL.
3238 Smaller columns can be converted by including the column name. Using
3239 StrLs can reduce output file size when strings are longer than 8
3240 characters and are either frequently repeated or sparse.
3241 {compression_options}
3243 .. versionchanged:: 1.4.0 Zstandard support.
3245 value_labels : dict of dicts
3246 Dictionary containing columns as keys and dictionaries of column value
3247 to labels as values. The combined length of all labels for a single
3248 variable must be 32,000 characters or smaller.
3250 .. versionadded:: 1.4.0
3252 Returns
3253 -------
3254 writer : StataWriter117 instance
3255 The StataWriter117 instance has a write_file method, which will
3256 write the file to the given `fname`.
3258 Raises
3259 ------
3260 NotImplementedError
3261 * If datetimes contain timezone information
3262 ValueError
3263 * Columns listed in convert_dates are neither datetime64[ns]
3264 nor datetime
3265 * Column dtype is not representable in Stata
3266 * Column listed in convert_dates is not in DataFrame
3267 * Categorical label contains more than 32,000 characters
3269 Examples
3270 --------
3271 >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
3272 >>> writer = pd.io.stata.StataWriter117('./data_file.dta', data)
3273 >>> writer.write_file()
3275 Directly write a zip file
3276 >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
3277 >>> writer = pd.io.stata.StataWriter117(
3278 ... './data_file.zip', data, compression=compression
3279 ... )
3280 >>> writer.write_file()
3282 Or with long strings stored in strl format
3283 >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
3284 ... columns=['strls'])
3285 >>> writer = pd.io.stata.StataWriter117(
3286 ... './data_file_with_long_strings.dta', data, convert_strl=['strls'])
3287 >>> writer.write_file()
3288 """
3290 _max_string_length = 2045
3291 _dta_version = 117
3293 def __init__(
3294 self,
3295 fname: FilePath | WriteBuffer[bytes],
3296 data: DataFrame,
3297 convert_dates: dict[Hashable, str] | None = None,
3298 write_index: bool = True,
3299 byteorder: str | None = None,
3300 time_stamp: datetime | None = None,
3301 data_label: str | None = None,
3302 variable_labels: dict[Hashable, str] | None = None,
3303 convert_strl: Sequence[Hashable] | None = None,
3304 compression: CompressionOptions = "infer",
3305 storage_options: StorageOptions | None = None,
3306 *,
3307 value_labels: dict[Hashable, dict[float, str]] | None = None,
3308 ) -> None:
3309 # Copy to new list since convert_strl might be modified later
3310 self._convert_strl: list[Hashable] = []
3311 if convert_strl is not None:
3312 self._convert_strl.extend(convert_strl)
3314 super().__init__(
3315 fname,
3316 data,
3317 convert_dates,
3318 write_index,
3319 byteorder=byteorder,
3320 time_stamp=time_stamp,
3321 data_label=data_label,
3322 variable_labels=variable_labels,
3323 value_labels=value_labels,
3324 compression=compression,
3325 storage_options=storage_options,
3326 )
3327 self._map: dict[str, int] = {}
3328 self._strl_blob = b""
3330 @staticmethod
3331 def _tag(val: str | bytes, tag: str) -> bytes:
3332 """Surround val with <tag></tag>"""
3333 if isinstance(val, str):
3334 val = bytes(val, "utf-8")
3335 return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8")
3337 def _update_map(self, tag: str) -> None:
3338 """Update map location for tag with file position"""
3339 assert self.handles.handle is not None
3340 self._map[tag] = self.handles.handle.tell()
3342 def _write_header(
3343 self,
3344 data_label: str | None = None,
3345 time_stamp: datetime | None = None,
3346 ) -> None:
3347 """Write the file header"""
3348 byteorder = self._byteorder
3349 self._write_bytes(bytes("<stata_dta>", "utf-8"))
3350 bio = BytesIO()
3351 # ds_format - release version (117, 118, or 119)
3352 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
3353 # byteorder
3354 bio.write(self._tag("MSF" if byteorder == ">" else "LSF", "byteorder"))
3355 # number of vars, 2 bytes in 117 and 118, 4 bytes in 119
3356 nvar_type = "H" if self._dta_version <= 118 else "I"
3357 bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
3358 # number of obs: 117 uses 4 bytes, 118 and 119 use 8
3359 nobs_size = "I" if self._dta_version == 117 else "Q"
3360 bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
3361 # data label 81 bytes, char, null terminated
3362 label = data_label[:80] if data_label is not None else ""
3363 encoded_label = label.encode(self._encoding)
3364 label_size = "B" if self._dta_version == 117 else "H"
3365 label_len = struct.pack(byteorder + label_size, len(encoded_label))
3366 encoded_label = label_len + encoded_label
3367 bio.write(self._tag(encoded_label, "label"))
3368 # time stamp, 18 bytes, char, null terminated
3369 # format dd Mon yyyy hh:mm
3370 if time_stamp is None:
3371 time_stamp = datetime.now()
3372 elif not isinstance(time_stamp, datetime):
3373 raise ValueError("time_stamp should be datetime type")
3374 # Avoid locale-specific month conversion
3375 months = [
3376 "Jan",
3377 "Feb",
3378 "Mar",
3379 "Apr",
3380 "May",
3381 "Jun",
3382 "Jul",
3383 "Aug",
3384 "Sep",
3385 "Oct",
3386 "Nov",
3387 "Dec",
3388 ]
3389 month_lookup = {i + 1: month for i, month in enumerate(months)}
3390 ts = (
3391 time_stamp.strftime("%d ")
3392 + month_lookup[time_stamp.month]
3393 + time_stamp.strftime(" %Y %H:%M")
3394 )
3395 # '\x11' added due to inspection of Stata file
3396 stata_ts = b"\x11" + bytes(ts, "utf-8")
3397 bio.write(self._tag(stata_ts, "timestamp"))
3398 self._write_bytes(self._tag(bio.getvalue(), "header"))
3400 def _write_map(self) -> None:
3401 """
3402 Called twice during file write. The first call populates the values in
3403 the map with 0s. The second call writes the final map locations when
3404 all blocks have been written.
3405 """
3406 if not self._map:
3407 self._map = {
3408 "stata_data": 0,
3409 "map": self.handles.handle.tell(),
3410 "variable_types": 0,
3411 "varnames": 0,
3412 "sortlist": 0,
3413 "formats": 0,
3414 "value_label_names": 0,
3415 "variable_labels": 0,
3416 "characteristics": 0,
3417 "data": 0,
3418 "strls": 0,
3419 "value_labels": 0,
3420 "stata_data_close": 0,
3421 "end-of-file": 0,
3422 }
3423 # Move to start of map
3424 self.handles.handle.seek(self._map["map"])
3425 bio = BytesIO()
3426 for val in self._map.values():
3427 bio.write(struct.pack(self._byteorder + "Q", val))
3428 self._write_bytes(self._tag(bio.getvalue(), "map"))
3430 def _write_variable_types(self) -> None:
3431 self._update_map("variable_types")
3432 bio = BytesIO()
3433 for typ in self.typlist:
3434 bio.write(struct.pack(self._byteorder + "H", typ))
3435 self._write_bytes(self._tag(bio.getvalue(), "variable_types"))
3437 def _write_varnames(self) -> None:
3438 self._update_map("varnames")
3439 bio = BytesIO()
3440 # 118 scales by 4 to accommodate utf-8 data worst case encoding
3441 vn_len = 32 if self._dta_version == 117 else 128
3442 for name in self.varlist:
3443 name = self._null_terminate_str(name)
3444 name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1)
3445 bio.write(name)
3446 self._write_bytes(self._tag(bio.getvalue(), "varnames"))
3448 def _write_sortlist(self) -> None:
3449 self._update_map("sortlist")
3450 sort_size = 2 if self._dta_version < 119 else 4
3451 self._write_bytes(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
3453 def _write_formats(self) -> None:
3454 self._update_map("formats")
3455 bio = BytesIO()
3456 fmt_len = 49 if self._dta_version == 117 else 57
3457 for fmt in self.fmtlist:
3458 bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len))
3459 self._write_bytes(self._tag(bio.getvalue(), "formats"))
3461 def _write_value_label_names(self) -> None:
3462 self._update_map("value_label_names")
3463 bio = BytesIO()
3464 # 118 scales by 4 to accommodate utf-8 data worst case encoding
3465 vl_len = 32 if self._dta_version == 117 else 128
3466 for i in range(self.nvar):
3467 # Use variable name when categorical
3468 name = "" # default name
3469 if self._has_value_labels[i]:
3470 name = self.varlist[i]
3471 name = self._null_terminate_str(name)
3472 encoded_name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1)
3473 bio.write(encoded_name)
3474 self._write_bytes(self._tag(bio.getvalue(), "value_label_names"))
3476 def _write_variable_labels(self) -> None:
3477 # Missing labels are 80 blank characters plus null termination
3478 self._update_map("variable_labels")
3479 bio = BytesIO()
3480 # 118 scales by 4 to accommodate utf-8 data worst case encoding
3481 vl_len = 80 if self._dta_version == 117 else 320
3482 blank = _pad_bytes_new("", vl_len + 1)
3484 if self._variable_labels is None:
3485 for _ in range(self.nvar):
3486 bio.write(blank)
3487 self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))
3488 return
3490 for col in self.data:
3491 if col in self._variable_labels:
3492 label = self._variable_labels[col]
3493 if len(label) > 80:
3494 raise ValueError("Variable labels must be 80 characters or fewer")
3495 try:
3496 encoded = label.encode(self._encoding)
3497 except UnicodeEncodeError as err:
3498 raise ValueError(
3499 "Variable labels must contain only characters that "
3500 f"can be encoded in {self._encoding}"
3501 ) from err
3503 bio.write(_pad_bytes_new(encoded, vl_len + 1))
3504 else:
3505 bio.write(blank)
3506 self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))
3508 def _write_characteristics(self) -> None:
3509 self._update_map("characteristics")
3510 self._write_bytes(self._tag(b"", "characteristics"))
3512 def _write_data(self, records) -> None:
3513 self._update_map("data")
3514 self._write_bytes(b"<data>")
3515 self._write_bytes(records.tobytes())
3516 self._write_bytes(b"</data>")
3518 def _write_strls(self) -> None:
3519 self._update_map("strls")
3520 self._write_bytes(self._tag(self._strl_blob, "strls"))
3522 def _write_expansion_fields(self) -> None:
3523 """No-op in dta 117+"""
3525 def _write_value_labels(self) -> None:
3526 self._update_map("value_labels")
3527 bio = BytesIO()
3528 for vl in self._value_labels:
3529 lab = vl.generate_value_label(self._byteorder)
3530 lab = self._tag(lab, "lbl")
3531 bio.write(lab)
3532 self._write_bytes(self._tag(bio.getvalue(), "value_labels"))
3534 def _write_file_close_tag(self) -> None:
3535 self._update_map("stata_data_close")
3536 self._write_bytes(bytes("</stata_dta>", "utf-8"))
3537 self._update_map("end-of-file")
3539 def _update_strl_names(self) -> None:
3540 """
3541 Update column names for conversion to strl if they might have been
3542 changed to comply with Stata naming rules
3543 """
3544 # Update convert_strl if names changed
3545 for orig, new in self._converted_names.items():
3546 if orig in self._convert_strl:
3547 idx = self._convert_strl.index(orig)
3548 self._convert_strl[idx] = new

    def _convert_strls(self, data: DataFrame) -> DataFrame:
        """
        Convert columns to StrLs if either very large or in the
        convert_strl variable
        """
        convert_cols = [
            col
            for i, col in enumerate(data)
            if self.typlist[i] == 32768 or col in self._convert_strl
        ]

        if convert_cols:
            ssw = StataStrLWriter(data, convert_cols, version=self._dta_version)
            tab, new_data = ssw.generate_table()
            data = new_data
            self._strl_blob = ssw.generate_blob(tab)
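        # Illustration (a sketch of the flow above): a column is routed through
        # StataStrLWriter either because its Stata type is 32768 (strL) or
        # because the caller listed it in convert_strl; its values in the
        # returned DataFrame are replaced by references into the blob that
        # _write_strls later writes to the file.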
        return data

    def _set_formats_and_types(self, dtypes: Series) -> None:
        self.typlist = []
        self.fmtlist = []
        for col, dtype in dtypes.items():
            force_strl = col in self._convert_strl
            fmt = _dtype_to_default_stata_fmt(
                dtype,
                self.data[col],
                dta_version=self._dta_version,
                force_strl=force_strl,
            )
            self.fmtlist.append(fmt)
            self.typlist.append(
                _dtype_to_stata_type_117(dtype, self.data[col], force_strl)
            )
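        # Example of the resulting parallel lists (illustrative; the exact
        # format strings come from _dtype_to_default_stata_fmt): a float64
        # column plus a forced-strL string column might yield
        #   fmtlist == ["%10.0g", "%9s"]
        #   typlist == [65526, 32768]
        # where 65526 is the dta 117+ type code for double and 32768 for strL.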


class StataWriterUTF8(StataWriter117):
    """
    Stata binary dta file writing in Stata 15 (118) and 16 (119) formats

    DTA 118 and 119 format files support unicode string data (both fixed
    width and strL). Unicode is also supported in value labels, variable
    labels and the dataset label. Format 119 is automatically used if the
    file contains more than 32,767 variables.

    Parameters
    ----------
    fname : path (string), buffer or path object
        string, path object (pathlib.Path or py._path.local.LocalPath) or
        object implementing a binary write() function. If using a buffer
        then the buffer will not be automatically closed after the file
        is written.
    data : DataFrame
        Input to save
    convert_dates : dict, default None
        Dictionary mapping columns containing datetime types to stata internal
        format to use when writing the dates. Options are 'tc', 'td', 'tm',
        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
        Datetime columns that do not have a conversion type specified will be
        converted to 'tc'. Raises NotImplementedError if a datetime column has
        timezone information.
    write_index : bool, default True
        Write the index to Stata dataset.
    byteorder : str, default None
        Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
    time_stamp : datetime, default None
        A datetime to use as file creation date. Default is the current time.
    data_label : str, default None
        A label for the data set. Must be 80 characters or smaller.
    variable_labels : dict, default None
        Dictionary containing columns as keys and variable labels as values.
        Each label must be 80 characters or smaller.
    convert_strl : list, default None
        List of column names to convert to Stata StrL format. Columns with
        more than 2045 characters are automatically written as StrL.
        Smaller columns can be converted by including the column name. Using
        StrLs can reduce output file size when strings are longer than 8
        characters, and either frequently repeated or sparse.
    version : int, default None
        The dta version to use. By default, uses the size of data to determine
        the version. 118 is used if data.shape[1] <= 32767, and 119 is used
        for storing larger DataFrames.
    {compression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    value_labels : dict of dicts
        Dictionary containing columns as keys and dictionaries of column value
        to labels as values. The combined length of all labels for a single
        variable must be 32,000 characters or smaller.

        .. versionadded:: 1.4.0

    Returns
    -------
    StataWriterUTF8
        The instance has a write_file method, which will write the file to the
        given `fname`.

    Raises
    ------
    NotImplementedError
        * If datetimes contain timezone information
    ValueError
        * Columns listed in convert_dates are neither datetime64[ns]
          nor datetime
        * Column dtype is not representable in Stata
        * Column listed in convert_dates is not in DataFrame
        * Categorical label contains more than 32,000 characters

    Examples
    --------
    Using Unicode data and column names

    >>> from pandas.io.stata import StataWriterUTF8
    >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
    >>> writer = StataWriterUTF8('./data_file.dta', data)
    >>> writer.write_file()

    Directly write a zip file

    >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
    >>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression)
    >>> writer.write_file()

    Or with long strings stored in strl format

    >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
    ...                     columns=['strls'])
    >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
    ...                          convert_strl=['strls'])
    >>> writer.write_file()
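
    Value labels can be attached at write time as well (a minimal sketch using
    the ``value_labels`` argument described above; the output path is
    illustrative):

    >>> data = pd.DataFrame({'fully_labelled': [1, 2, 3, 3, 1]})
    >>> value_labels = {'fully_labelled': {1: 'one', 2: 'two', 3: 'three'}}
    >>> writer = StataWriterUTF8('./data_file_labelled.dta', data,
    ...                          value_labels=value_labels)
    >>> writer.write_file()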
3680 """
3682 _encoding: Literal["utf-8"] = "utf-8"

    def __init__(
        self,
        fname: FilePath | WriteBuffer[bytes],
        data: DataFrame,
        convert_dates: dict[Hashable, str] | None = None,
        write_index: bool = True,
        byteorder: str | None = None,
        time_stamp: datetime | None = None,
        data_label: str | None = None,
        variable_labels: dict[Hashable, str] | None = None,
        convert_strl: Sequence[Hashable] | None = None,
        version: int | None = None,
        compression: CompressionOptions = "infer",
        storage_options: StorageOptions | None = None,
        *,
        value_labels: dict[Hashable, dict[float, str]] | None = None,
    ) -> None:
        if version is None:
            version = 118 if data.shape[1] <= 32767 else 119
        elif version not in (118, 119):
            raise ValueError("version must be either 118 or 119.")
        elif version == 118 and data.shape[1] > 32767:
            raise ValueError(
                "You must use version 119 for data sets containing more than "
                "32,767 variables"
            )

        super().__init__(
            fname,
            data,
            convert_dates=convert_dates,
            write_index=write_index,
            byteorder=byteorder,
            time_stamp=time_stamp,
            data_label=data_label,
            variable_labels=variable_labels,
            value_labels=value_labels,
            convert_strl=convert_strl,
            compression=compression,
            storage_options=storage_options,
        )
        # Override version set in StataWriter117 init
        self._dta_version = version

    def _validate_variable_name(self, name: str) -> str:
        """
        Validate variable names for Stata export.

        Parameters
        ----------
        name : str
            Variable name

        Returns
        -------
        str
            The validated name with invalid characters replaced with
            underscores.

        Notes
        -----
        Stata 118+ supports most unicode characters. The only limitation is in
        the ASCII range, where the supported characters are a-z, A-Z, 0-9 and _.
        """
        # High code points appear to be acceptable
        for c in name:
            if (
                (
                    ord(c) < 128
                    and (c < "A" or c > "Z")
                    and (c < "a" or c > "z")
                    and (c < "0" or c > "9")
                    and c != "_"
                )
                or 128 <= ord(c) < 192
                or c in {"×", "÷"}  # noqa: RUF001
            ):
                name = name.replace(c, "_")
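        # Illustration (derived from the rule above, not a doctest):
        #   "gdp-growth" -> "gdp_growth"  (ASCII hyphen is not in [A-Za-z0-9_])
        #   "βeta"       -> "βeta"        (code points >= 192 pass through)
        #   "a×b"        -> "a_b"         (multiplication sign is explicitly banned)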

        return name