1"""
2Internal module for formatting output data in csv, html, xml,
3and latex files. This module also applies to display formatting.
4"""
5from __future__ import annotations
6
7from collections.abc import (
8 Generator,
9 Hashable,
10 Mapping,
11 Sequence,
12)
13from contextlib import contextmanager
14from csv import QUOTE_NONE
15from decimal import Decimal
16from functools import partial
17from io import StringIO
18import math
19import re
20from shutil import get_terminal_size
21from typing import (
22 TYPE_CHECKING,
23 Any,
24 Callable,
25 Final,
26 cast,
27)
28
29import numpy as np
30
31from pandas._config.config import (
32 get_option,
33 set_option,
34)
35
36from pandas._libs import lib
37from pandas._libs.missing import NA
38from pandas._libs.tslibs import (
39 NaT,
40 Timedelta,
41 Timestamp,
42)
43from pandas._libs.tslibs.nattype import NaTType
44
45from pandas.core.dtypes.common import (
46 is_complex_dtype,
47 is_float,
48 is_integer,
49 is_list_like,
50 is_numeric_dtype,
51 is_scalar,
52)
53from pandas.core.dtypes.dtypes import (
54 CategoricalDtype,
55 DatetimeTZDtype,
56 ExtensionDtype,
57)
58from pandas.core.dtypes.missing import (
59 isna,
60 notna,
61)
62
63from pandas.core.arrays import (
64 Categorical,
65 DatetimeArray,
66 ExtensionArray,
67 TimedeltaArray,
68)
69from pandas.core.arrays.string_ import StringDtype
70from pandas.core.base import PandasObject
71import pandas.core.common as com
72from pandas.core.indexes.api import (
73 Index,
74 MultiIndex,
75 PeriodIndex,
76 ensure_index,
77)
78from pandas.core.indexes.datetimes import DatetimeIndex
79from pandas.core.indexes.timedeltas import TimedeltaIndex
80from pandas.core.reshape.concat import concat
81
82from pandas.io.common import (
83 check_parent_directory,
84 stringify_path,
85)
86from pandas.io.formats import printing
87
88if TYPE_CHECKING:
89 from pandas._typing import (
90 ArrayLike,
91 Axes,
92 ColspaceArgType,
93 ColspaceType,
94 CompressionOptions,
95 FilePath,
96 FloatFormatType,
97 FormattersType,
98 IndexLabel,
99 SequenceNotStr,
100 StorageOptions,
101 WriteBuffer,
102 )
103
104 from pandas import (
105 DataFrame,
106 Series,
107 )
108
109
# Shared numpydoc ``Parameters`` section, kept as a %-style template.
# The ``%(col_space_type)s``, ``%(col_space)s``, ``%(header_type)s`` and
# ``%(header)s`` placeholders are left unfilled here; the text is also
# appended verbatim to ``DataFrameFormatter.__doc__`` further down.
# The indentation inside the literal is part of the rendered docstring.
common_docstring: Final = """
        Parameters
        ----------
        buf : str, Path or StringIO-like, optional, default None
            Buffer to write to. If None, the output is returned as a string.
        columns : array-like, optional, default None
            The subset of columns to write. Writes all columns by default.
        col_space : %(col_space_type)s, optional
            %(col_space)s.
        header : %(header_type)s, optional
            %(header)s.
        index : bool, optional, default True
            Whether to print index (row) labels.
        na_rep : str, optional, default 'NaN'
            String representation of ``NaN`` to use.
        formatters : list, tuple or dict of one-param. functions, optional
            Formatter functions to apply to columns' elements by position or
            name.
            The result of each function must be a unicode string.
            List/tuple must be of length equal to the number of columns.
        float_format : one-parameter function, optional, default None
            Formatter function to apply to columns' elements if they are
            floats. This function must return a unicode string and will be
            applied only to the non-``NaN`` elements, with ``NaN`` being
            handled by ``na_rep``.
        sparsify : bool, optional, default True
            Set to False for a DataFrame with a hierarchical index to print
            every multiindex key at each row.
        index_names : bool, optional, default True
            Prints the names of the indexes.
        justify : str, default None
            How to justify the column labels. If None uses the option from
            the print configuration (controlled by set_option), 'right' out
            of the box. Valid values are

            * left
            * right
            * center
            * justify
            * justify-all
            * start
            * end
            * inherit
            * match-parent
            * initial
            * unset.
        max_rows : int, optional
            Maximum number of rows to display in the console.
        max_cols : int, optional
            Maximum number of columns to display in the console.
        show_dimensions : bool, default False
            Display DataFrame dimensions (number of rows by number of columns).
        decimal : str, default '.'
            Character recognized as decimal separator, e.g. ',' in Europe.
    """
165
# Accepted spellings for the ``justify`` argument; the entries match the
# bullet list under ``justify`` in ``common_docstring`` one for one.
# NOTE(review): presumably consumed by a validator outside this chunk
# (nothing in the visible code reads it) - confirm before changing.
VALID_JUSTIFY_PARAMETERS = (
    "left",
    "right",
    "center",
    "justify",
    "justify-all",
    "start",
    "end",
    "inherit",
    "match-parent",
    "initial",
    "unset",
)
179
# Shared numpydoc ``Returns`` section. It is concatenated after
# ``common_docstring`` when building ``DataFrameFormatter.__doc__`` below.
# The indentation inside the literal is part of the rendered docstring.
return_docstring: Final = """
        Returns
        -------
        str or None
            If buf is None, returns the result as a string. Otherwise returns
            None.
    """
187
188
189class SeriesFormatter:
190 """
191 Implement the main logic of Series.to_string, which underlies
192 Series.__repr__.
193 """
194
195 def __init__(
196 self,
197 series: Series,
198 *,
199 length: bool | str = True,
200 header: bool = True,
201 index: bool = True,
202 na_rep: str = "NaN",
203 name: bool = False,
204 float_format: str | None = None,
205 dtype: bool = True,
206 max_rows: int | None = None,
207 min_rows: int | None = None,
208 ) -> None:
209 self.series = series
210 self.buf = StringIO()
211 self.name = name
212 self.na_rep = na_rep
213 self.header = header
214 self.length = length
215 self.index = index
216 self.max_rows = max_rows
217 self.min_rows = min_rows
218
219 if float_format is None:
220 float_format = get_option("display.float_format")
221 self.float_format = float_format
222 self.dtype = dtype
223 self.adj = printing.get_adjustment()
224
225 self._chk_truncate()
226
227 def _chk_truncate(self) -> None:
228 self.tr_row_num: int | None
229
230 min_rows = self.min_rows
231 max_rows = self.max_rows
232 # truncation determined by max_rows, actual truncated number of rows
233 # used below by min_rows
234 is_truncated_vertically = max_rows and (len(self.series) > max_rows)
235 series = self.series
236 if is_truncated_vertically:
237 max_rows = cast(int, max_rows)
238 if min_rows:
239 # if min_rows is set (not None or 0), set max_rows to minimum
240 # of both
241 max_rows = min(min_rows, max_rows)
242 if max_rows == 1:
243 row_num = max_rows
244 series = series.iloc[:max_rows]
245 else:
246 row_num = max_rows // 2
247 series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
248 self.tr_row_num = row_num
249 else:
250 self.tr_row_num = None
251 self.tr_series = series
252 self.is_truncated_vertically = is_truncated_vertically
253
254 def _get_footer(self) -> str:
255 name = self.series.name
256 footer = ""
257
258 index = self.series.index
259 if (
260 isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex))
261 and index.freq is not None
262 ):
263 footer += f"Freq: {index.freqstr}"
264
265 if self.name is not False and name is not None:
266 if footer:
267 footer += ", "
268
269 series_name = printing.pprint_thing(name, escape_chars=("\t", "\r", "\n"))
270 footer += f"Name: {series_name}"
271
272 if self.length is True or (
273 self.length == "truncate" and self.is_truncated_vertically
274 ):
275 if footer:
276 footer += ", "
277 footer += f"Length: {len(self.series)}"
278
279 if self.dtype is not False and self.dtype is not None:
280 dtype_name = getattr(self.tr_series.dtype, "name", None)
281 if dtype_name:
282 if footer:
283 footer += ", "
284 footer += f"dtype: {printing.pprint_thing(dtype_name)}"
285
286 # level infos are added to the end and in a new line, like it is done
287 # for Categoricals
288 if isinstance(self.tr_series.dtype, CategoricalDtype):
289 level_info = self.tr_series._values._get_repr_footer()
290 if footer:
291 footer += "\n"
292 footer += level_info
293
294 return str(footer)
295
296 def _get_formatted_values(self) -> list[str]:
297 return format_array(
298 self.tr_series._values,
299 None,
300 float_format=self.float_format,
301 na_rep=self.na_rep,
302 leading_space=self.index,
303 )
304
305 def to_string(self) -> str:
306 series = self.tr_series
307 footer = self._get_footer()
308
309 if len(series) == 0:
310 return f"{type(self.series).__name__}([], {footer})"
311
312 index = series.index
313 have_header = _has_names(index)
314 if isinstance(index, MultiIndex):
315 fmt_index = index._format_multi(include_names=True, sparsify=None)
316 adj = printing.get_adjustment()
317 fmt_index = adj.adjoin(2, *fmt_index).split("\n")
318 else:
319 fmt_index = index._format_flat(include_name=True)
320 fmt_values = self._get_formatted_values()
321
322 if self.is_truncated_vertically:
323 n_header_rows = 0
324 row_num = self.tr_row_num
325 row_num = cast(int, row_num)
326 width = self.adj.len(fmt_values[row_num - 1])
327 if width > 3:
328 dot_str = "..."
329 else:
330 dot_str = ".."
331 # Series uses mode=center because it has single value columns
332 # DataFrame uses mode=left
333 dot_str = self.adj.justify([dot_str], width, mode="center")[0]
334 fmt_values.insert(row_num + n_header_rows, dot_str)
335 fmt_index.insert(row_num + 1, "")
336
337 if self.index:
338 result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values])
339 else:
340 result = self.adj.adjoin(3, fmt_values)
341
342 if self.header and have_header:
343 result = fmt_index[0] + "\n" + result
344
345 if footer:
346 result += "\n" + footer
347
348 return str("".join(result))
349
350
def get_dataframe_repr_params() -> dict[str, Any]:
    """Collect the keyword arguments that make ``to_string`` mimic ``repr``.

    Passing the returned mapping to ``DataFrame.to_string`` yields the same
    text as ``repr(DataFrame)``, which is handy as a starting point when only
    a few aspects of the repr should be tweaked.

    .. versionadded:: 1.4.0

    Example
    -------
    >>> import pandas as pd
    >>>
    >>> df = pd.DataFrame([[1, 2], [3, 4]])
    >>> repr_params = pd.io.formats.format.get_dataframe_repr_params()
    >>> repr(df) == df.to_string(**repr_params)
    True
    """
    from pandas.io.formats import console

    # Wrapping at the console width only applies when frames may expand
    # over several "pages".
    line_width = None
    if get_option("display.expand_frame_repr"):
        line_width = console.get_console_size()[0]

    option_for_param = (
        ("max_rows", "display.max_rows"),
        ("min_rows", "display.min_rows"),
        ("max_cols", "display.max_columns"),
        ("max_colwidth", "display.max_colwidth"),
        ("show_dimensions", "display.show_dimensions"),
    )
    params: dict[str, Any] = {
        param: get_option(option) for param, option in option_for_param
    }
    params["line_width"] = line_width
    return params
382
383
def get_series_repr_params() -> dict[str, Any]:
    """Collect the keyword arguments that make ``to_string`` mimic ``repr``.

    Passing the returned mapping to ``Series.to_string`` yields the same text
    as ``repr(series)``, which is handy as a starting point when only a few
    aspects of the series repr should be tweaked.

    .. versionadded:: 1.4.0

    Example
    -------
    >>> import pandas as pd
    >>>
    >>> ser = pd.Series([1, 2, 3, 4])
    >>> repr_params = pd.io.formats.format.get_series_repr_params()
    >>> repr(ser) == ser.to_string(**repr_params)
    True
    """
    terminal_height = get_terminal_size().lines
    configured_max_rows = get_option("display.max_rows")

    if configured_max_rows == 0:
        # ``display.max_rows == 0``: both limits follow the terminal height.
        max_rows = min_rows = terminal_height
    else:
        max_rows = configured_max_rows
        min_rows = get_option("display.min_rows")

    return {
        "name": True,
        "dtype": True,
        "min_rows": min_rows,
        "max_rows": max_rows,
        "length": get_option("display.show_dimensions"),
    }
413
414
415class DataFrameFormatter:
416 """
417 Class for processing dataframe formatting options and data.
418
419 Used by DataFrame.to_string, which backs DataFrame.__repr__.
420 """
421
422 __doc__ = __doc__ if __doc__ else ""
423 __doc__ += common_docstring + return_docstring
424
425 def __init__(
426 self,
427 frame: DataFrame,
428 columns: Axes | None = None,
429 col_space: ColspaceArgType | None = None,
430 header: bool | SequenceNotStr[str] = True,
431 index: bool = True,
432 na_rep: str = "NaN",
433 formatters: FormattersType | None = None,
434 justify: str | None = None,
435 float_format: FloatFormatType | None = None,
436 sparsify: bool | None = None,
437 index_names: bool = True,
438 max_rows: int | None = None,
439 min_rows: int | None = None,
440 max_cols: int | None = None,
441 show_dimensions: bool | str = False,
442 decimal: str = ".",
443 bold_rows: bool = False,
444 escape: bool = True,
445 ) -> None:
446 self.frame = frame
447 self.columns = self._initialize_columns(columns)
448 self.col_space = self._initialize_colspace(col_space)
449 self.header = header
450 self.index = index
451 self.na_rep = na_rep
452 self.formatters = self._initialize_formatters(formatters)
453 self.justify = self._initialize_justify(justify)
454 self.float_format = float_format
455 self.sparsify = self._initialize_sparsify(sparsify)
456 self.show_index_names = index_names
457 self.decimal = decimal
458 self.bold_rows = bold_rows
459 self.escape = escape
460 self.max_rows = max_rows
461 self.min_rows = min_rows
462 self.max_cols = max_cols
463 self.show_dimensions = show_dimensions
464
465 self.max_cols_fitted = self._calc_max_cols_fitted()
466 self.max_rows_fitted = self._calc_max_rows_fitted()
467
468 self.tr_frame = self.frame
469 self.truncate()
470 self.adj = printing.get_adjustment()
471
472 def get_strcols(self) -> list[list[str]]:
473 """
474 Render a DataFrame to a list of columns (as lists of strings).
475 """
476 strcols = self._get_strcols_without_index()
477
478 if self.index:
479 str_index = self._get_formatted_index(self.tr_frame)
480 strcols.insert(0, str_index)
481
482 return strcols
483
484 @property
485 def should_show_dimensions(self) -> bool:
486 return self.show_dimensions is True or (
487 self.show_dimensions == "truncate" and self.is_truncated
488 )
489
490 @property
491 def is_truncated(self) -> bool:
492 return bool(self.is_truncated_horizontally or self.is_truncated_vertically)
493
494 @property
495 def is_truncated_horizontally(self) -> bool:
496 return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted))
497
498 @property
499 def is_truncated_vertically(self) -> bool:
500 return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted))
501
502 @property
503 def dimensions_info(self) -> str:
504 return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]"
505
506 @property
507 def has_index_names(self) -> bool:
508 return _has_names(self.frame.index)
509
510 @property
511 def has_column_names(self) -> bool:
512 return _has_names(self.frame.columns)
513
514 @property
515 def show_row_idx_names(self) -> bool:
516 return all((self.has_index_names, self.index, self.show_index_names))
517
518 @property
519 def show_col_idx_names(self) -> bool:
520 return all((self.has_column_names, self.show_index_names, self.header))
521
522 @property
523 def max_rows_displayed(self) -> int:
524 return min(self.max_rows or len(self.frame), len(self.frame))
525
526 def _initialize_sparsify(self, sparsify: bool | None) -> bool:
527 if sparsify is None:
528 return get_option("display.multi_sparse")
529 return sparsify
530
531 def _initialize_formatters(
532 self, formatters: FormattersType | None
533 ) -> FormattersType:
534 if formatters is None:
535 return {}
536 elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict):
537 return formatters
538 else:
539 raise ValueError(
540 f"Formatters length({len(formatters)}) should match "
541 f"DataFrame number of columns({len(self.frame.columns)})"
542 )
543
544 def _initialize_justify(self, justify: str | None) -> str:
545 if justify is None:
546 return get_option("display.colheader_justify")
547 else:
548 return justify
549
550 def _initialize_columns(self, columns: Axes | None) -> Index:
551 if columns is not None:
552 cols = ensure_index(columns)
553 self.frame = self.frame[cols]
554 return cols
555 else:
556 return self.frame.columns
557
558 def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceType:
559 result: ColspaceType
560
561 if col_space is None:
562 result = {}
563 elif isinstance(col_space, (int, str)):
564 result = {"": col_space}
565 result.update({column: col_space for column in self.frame.columns})
566 elif isinstance(col_space, Mapping):
567 for column in col_space.keys():
568 if column not in self.frame.columns and column != "":
569 raise ValueError(
570 f"Col_space is defined for an unknown column: {column}"
571 )
572 result = col_space
573 else:
574 if len(self.frame.columns) != len(col_space):
575 raise ValueError(
576 f"Col_space length({len(col_space)}) should match "
577 f"DataFrame number of columns({len(self.frame.columns)})"
578 )
579 result = dict(zip(self.frame.columns, col_space))
580 return result
581
582 def _calc_max_cols_fitted(self) -> int | None:
583 """Number of columns fitting the screen."""
584 if not self._is_in_terminal():
585 return self.max_cols
586
587 width, _ = get_terminal_size()
588 if self._is_screen_narrow(width):
589 return width
590 else:
591 return self.max_cols
592
593 def _calc_max_rows_fitted(self) -> int | None:
594 """Number of rows with data fitting the screen."""
595 max_rows: int | None
596
597 if self._is_in_terminal():
598 _, height = get_terminal_size()
599 if self.max_rows == 0:
600 # rows available to fill with actual data
601 return height - self._get_number_of_auxiliary_rows()
602
603 if self._is_screen_short(height):
604 max_rows = height
605 else:
606 max_rows = self.max_rows
607 else:
608 max_rows = self.max_rows
609
610 return self._adjust_max_rows(max_rows)
611
612 def _adjust_max_rows(self, max_rows: int | None) -> int | None:
613 """Adjust max_rows using display logic.
614
615 See description here:
616 https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options
617
618 GH #37359
619 """
620 if max_rows:
621 if (len(self.frame) > max_rows) and self.min_rows:
622 # if truncated, set max_rows showed to min_rows
623 max_rows = min(self.min_rows, max_rows)
624 return max_rows
625
626 def _is_in_terminal(self) -> bool:
627 """Check if the output is to be shown in terminal."""
628 return bool(self.max_cols == 0 or self.max_rows == 0)
629
630 def _is_screen_narrow(self, max_width) -> bool:
631 return bool(self.max_cols == 0 and len(self.frame.columns) > max_width)
632
633 def _is_screen_short(self, max_height) -> bool:
634 return bool(self.max_rows == 0 and len(self.frame) > max_height)
635
636 def _get_number_of_auxiliary_rows(self) -> int:
637 """Get number of rows occupied by prompt, dots and dimension info."""
638 dot_row = 1
639 prompt_row = 1
640 num_rows = dot_row + prompt_row
641
642 if self.show_dimensions:
643 num_rows += len(self.dimensions_info.splitlines())
644
645 if self.header:
646 num_rows += 1
647
648 return num_rows
649
650 def truncate(self) -> None:
651 """
652 Check whether the frame should be truncated. If so, slice the frame up.
653 """
654 if self.is_truncated_horizontally:
655 self._truncate_horizontally()
656
657 if self.is_truncated_vertically:
658 self._truncate_vertically()
659
660 def _truncate_horizontally(self) -> None:
661 """Remove columns, which are not to be displayed and adjust formatters.
662
663 Attributes affected:
664 - tr_frame
665 - formatters
666 - tr_col_num
667 """
668 assert self.max_cols_fitted is not None
669 col_num = self.max_cols_fitted // 2
670 if col_num >= 1:
671 left = self.tr_frame.iloc[:, :col_num]
672 right = self.tr_frame.iloc[:, -col_num:]
673 self.tr_frame = concat((left, right), axis=1)
674
675 # truncate formatter
676 if isinstance(self.formatters, (list, tuple)):
677 self.formatters = [
678 *self.formatters[:col_num],
679 *self.formatters[-col_num:],
680 ]
681 else:
682 col_num = cast(int, self.max_cols)
683 self.tr_frame = self.tr_frame.iloc[:, :col_num]
684 self.tr_col_num = col_num
685
686 def _truncate_vertically(self) -> None:
687 """Remove rows, which are not to be displayed.
688
689 Attributes affected:
690 - tr_frame
691 - tr_row_num
692 """
693 assert self.max_rows_fitted is not None
694 row_num = self.max_rows_fitted // 2
695 if row_num >= 1:
696 _len = len(self.tr_frame)
697 _slice = np.hstack([np.arange(row_num), np.arange(_len - row_num, _len)])
698 self.tr_frame = self.tr_frame.iloc[_slice]
699 else:
700 row_num = cast(int, self.max_rows)
701 self.tr_frame = self.tr_frame.iloc[:row_num, :]
702 self.tr_row_num = row_num
703
704 def _get_strcols_without_index(self) -> list[list[str]]:
705 strcols: list[list[str]] = []
706
707 if not is_list_like(self.header) and not self.header:
708 for i, c in enumerate(self.tr_frame):
709 fmt_values = self.format_col(i)
710 fmt_values = _make_fixed_width(
711 strings=fmt_values,
712 justify=self.justify,
713 minimum=int(self.col_space.get(c, 0)),
714 adj=self.adj,
715 )
716 strcols.append(fmt_values)
717 return strcols
718
719 if is_list_like(self.header):
720 # cast here since can't be bool if is_list_like
721 self.header = cast(list[str], self.header)
722 if len(self.header) != len(self.columns):
723 raise ValueError(
724 f"Writing {len(self.columns)} cols "
725 f"but got {len(self.header)} aliases"
726 )
727 str_columns = [[label] for label in self.header]
728 else:
729 str_columns = self._get_formatted_column_labels(self.tr_frame)
730
731 if self.show_row_idx_names:
732 for x in str_columns:
733 x.append("")
734
735 for i, c in enumerate(self.tr_frame):
736 cheader = str_columns[i]
737 header_colwidth = max(
738 int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
739 )
740 fmt_values = self.format_col(i)
741 fmt_values = _make_fixed_width(
742 fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
743 )
744
745 max_len = max(*(self.adj.len(x) for x in fmt_values), header_colwidth)
746 cheader = self.adj.justify(cheader, max_len, mode=self.justify)
747 strcols.append(cheader + fmt_values)
748
749 return strcols
750
751 def format_col(self, i: int) -> list[str]:
752 frame = self.tr_frame
753 formatter = self._get_formatter(i)
754 return format_array(
755 frame.iloc[:, i]._values,
756 formatter,
757 float_format=self.float_format,
758 na_rep=self.na_rep,
759 space=self.col_space.get(frame.columns[i]),
760 decimal=self.decimal,
761 leading_space=self.index,
762 )
763
764 def _get_formatter(self, i: str | int) -> Callable | None:
765 if isinstance(self.formatters, (list, tuple)):
766 if is_integer(i):
767 i = cast(int, i)
768 return self.formatters[i]
769 else:
770 return None
771 else:
772 if is_integer(i) and i not in self.columns:
773 i = self.columns[i]
774 return self.formatters.get(i, None)
775
776 def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]:
777 from pandas.core.indexes.multi import sparsify_labels
778
779 columns = frame.columns
780
781 if isinstance(columns, MultiIndex):
782 fmt_columns = columns._format_multi(sparsify=False, include_names=False)
783 fmt_columns = list(zip(*fmt_columns))
784 dtypes = self.frame.dtypes._values
785
786 # if we have a Float level, they don't use leading space at all
787 restrict_formatting = any(level.is_floating for level in columns.levels)
788 need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
789
790 def space_format(x, y):
791 if (
792 y not in self.formatters
793 and need_leadsp[x]
794 and not restrict_formatting
795 ):
796 return " " + y
797 return y
798
799 str_columns_tuple = list(
800 zip(*([space_format(x, y) for y in x] for x in fmt_columns))
801 )
802 if self.sparsify and len(str_columns_tuple):
803 str_columns_tuple = sparsify_labels(str_columns_tuple)
804
805 str_columns = [list(x) for x in zip(*str_columns_tuple)]
806 else:
807 fmt_columns = columns._format_flat(include_name=False)
808 dtypes = self.frame.dtypes
809 need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
810 str_columns = [
811 [" " + x if not self._get_formatter(i) and need_leadsp[x] else x]
812 for i, x in enumerate(fmt_columns)
813 ]
814 # self.str_columns = str_columns
815 return str_columns
816
817 def _get_formatted_index(self, frame: DataFrame) -> list[str]:
818 # Note: this is only used by to_string() and to_latex(), not by
819 # to_html(). so safe to cast col_space here.
820 col_space = {k: cast(int, v) for k, v in self.col_space.items()}
821 index = frame.index
822 columns = frame.columns
823 fmt = self._get_formatter("__index__")
824
825 if isinstance(index, MultiIndex):
826 fmt_index = index._format_multi(
827 sparsify=self.sparsify,
828 include_names=self.show_row_idx_names,
829 formatter=fmt,
830 )
831 else:
832 fmt_index = [
833 index._format_flat(include_name=self.show_row_idx_names, formatter=fmt)
834 ]
835
836 fmt_index = [
837 tuple(
838 _make_fixed_width(
839 list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj
840 )
841 )
842 for x in fmt_index
843 ]
844
845 adjoined = self.adj.adjoin(1, *fmt_index).split("\n")
846
847 # empty space for columns
848 if self.show_col_idx_names:
849 col_header = [str(x) for x in self._get_column_name_list()]
850 else:
851 col_header = [""] * columns.nlevels
852
853 if self.header:
854 return col_header + adjoined
855 else:
856 return adjoined
857
858 def _get_column_name_list(self) -> list[Hashable]:
859 names: list[Hashable] = []
860 columns = self.frame.columns
861 if isinstance(columns, MultiIndex):
862 names.extend("" if name is None else name for name in columns.names)
863 else:
864 names.append("" if columns.name is None else columns.name)
865 return names
866
867
class DataFrameRenderer:
    """Turn a configured formatter into concrete output formats.

    Called in pandas.core.generic.NDFrame:
        - to_csv
        - to_latex

    Called in pandas.core.frame.DataFrame:
        - to_html
        - to_string

    Parameters
    ----------
    fmt : DataFrameFormatter
        Formatter with the formatting options.
    """

    def __init__(self, fmt: DataFrameFormatter) -> None:
        self.fmt = fmt

    def to_html(
        self,
        buf: FilePath | WriteBuffer[str] | None = None,
        encoding: str | None = None,
        classes: str | list | tuple | None = None,
        notebook: bool = False,
        border: int | bool | None = None,
        table_id: str | None = None,
        render_links: bool = False,
    ) -> str | None:
        """
        Render a DataFrame to a html table.

        Parameters
        ----------
        buf : str, path object, file-like object, or None, default None
            String, path object (implementing ``os.PathLike[str]``), or file-like
            object implementing a string ``write()`` function. If None, the result is
            returned as a string.
        encoding : str, default "utf-8"
            Set character encoding.
        classes : str or list-like
            classes to include in the `class` attribute of the opening
            ``<table>`` tag, in addition to the default "dataframe".
        notebook : {True, False}, optional, default False
            Whether the generated HTML is for IPython Notebook.
        border : int
            A ``border=border`` attribute is included in the opening
            ``<table>`` tag. Default ``pd.options.display.html.border``.
        table_id : str, optional
            A css id is included in the opening `<table>` tag if specified.
        render_links : bool, default False
            Convert URLs to HTML links.

        Returns
        -------
        str or None
            The markup when ``buf`` is None, otherwise None.
        """
        from pandas.io.formats.html import (
            HTMLFormatter,
            NotebookFormatter,
        )

        if notebook:
            formatter_cls = NotebookFormatter
        else:
            formatter_cls = HTMLFormatter

        markup = formatter_cls(
            self.fmt,
            classes=classes,
            border=border,
            table_id=table_id,
            render_links=render_links,
        ).to_string()
        return save_to_buffer(markup, buf=buf, encoding=encoding)

    def to_string(
        self,
        buf: FilePath | WriteBuffer[str] | None = None,
        encoding: str | None = None,
        line_width: int | None = None,
    ) -> str | None:
        """
        Render a DataFrame to a console-friendly tabular output.

        Parameters
        ----------
        buf : str, path object, file-like object, or None, default None
            String, path object (implementing ``os.PathLike[str]``), or file-like
            object implementing a string ``write()`` function. If None, the result is
            returned as a string.
        encoding : str, default "utf-8"
            Set character encoding.
        line_width : int, optional
            Width to wrap a line in characters.

        Returns
        -------
        str or None
            The text when ``buf`` is None, otherwise None.
        """
        from pandas.io.formats.string import StringFormatter

        text = StringFormatter(self.fmt, line_width=line_width).to_string()
        return save_to_buffer(text, buf=buf, encoding=encoding)

    def to_csv(
        self,
        path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
        encoding: str | None = None,
        sep: str = ",",
        columns: Sequence[Hashable] | None = None,
        index_label: IndexLabel | None = None,
        mode: str = "w",
        compression: CompressionOptions = "infer",
        quoting: int | None = None,
        quotechar: str = '"',
        lineterminator: str | None = None,
        chunksize: int | None = None,
        date_format: str | None = None,
        doublequote: bool = True,
        escapechar: str | None = None,
        errors: str = "strict",
        storage_options: StorageOptions | None = None,
    ) -> str | None:
        """
        Render dataframe as comma-separated file.

        Returns the CSV text when ``path_or_buf`` is None; otherwise the
        data is written to ``path_or_buf`` and None is returned.
        """
        from pandas.io.formats.csvs import CSVFormatter

        # With no target, collect the output in memory and hand it back.
        return_as_string = path_or_buf is None
        if return_as_string:
            path_or_buf = StringIO()

        CSVFormatter(
            path_or_buf=path_or_buf,
            lineterminator=lineterminator,
            sep=sep,
            encoding=encoding,
            errors=errors,
            compression=compression,
            quoting=quoting,
            cols=columns,
            index_label=index_label,
            mode=mode,
            chunksize=chunksize,
            quotechar=quotechar,
            date_format=date_format,
            doublequote=doublequote,
            escapechar=escapechar,
            storage_options=storage_options,
            formatter=self.fmt,
        ).save()

        if not return_as_string:
            return None

        assert isinstance(path_or_buf, StringIO)
        content = path_or_buf.getvalue()
        path_or_buf.close()
        return content
1023
1024
1025def save_to_buffer(
1026 string: str,
1027 buf: FilePath | WriteBuffer[str] | None = None,
1028 encoding: str | None = None,
1029) -> str | None:
1030 """
1031 Perform serialization. Write to buf or return as string if buf is None.
1032 """
1033 with _get_buffer(buf, encoding=encoding) as fd:
1034 fd.write(string)
1035 if buf is None:
1036 # error: "WriteBuffer[str]" has no attribute "getvalue"
1037 return fd.getvalue() # type: ignore[attr-defined]
1038 return None
1039
1040
@contextmanager
def _get_buffer(
    buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None
) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]:
    """
    Context manager yielding something writable for ``buf``.

    A file name or path-like object is opened for writing and closed on exit.
    An object with a ``write`` method is yielded unchanged, and ``None``
    yields a fresh ``StringIO``.

    Raises
    ------
    ValueError
        If ``encoding`` is given but ``buf`` is not a file name.
    TypeError
        If ``buf`` is neither a file name nor has a ``write`` method.
    """
    target = StringIO() if buf is None else stringify_path(buf)

    if encoding is None:
        encoding = "utf-8"
    elif not isinstance(target, str):
        raise ValueError("buf is not a file name and encoding is specified.")

    if hasattr(target, "write"):
        # Incompatible types in "yield" (actual type "Union[str, WriteBuffer[str],
        # StringIO]", expected type "Union[WriteBuffer[str], StringIO]")
        yield target  # type: ignore[misc]
        return

    if not isinstance(target, str):
        raise TypeError("buf is not a file name and it has no write method")

    check_parent_directory(str(target))
    # GH#30034 open instead of codecs.open prevents a file leak
    # if we have an invalid encoding argument.
    # newline="" is needed to roundtrip correctly on
    # windows test_to_latex_filename
    with open(target, "w", encoding=encoding, newline="") as f:
        yield f
1073
1074
1075# ----------------------------------------------------------------------
1076# Array formatters
1077
1078
def format_array(
    values: ArrayLike,
    formatter: Callable | None,
    float_format: FloatFormatType | None = None,
    na_rep: str = "NaN",
    digits: int | None = None,
    space: str | int | None = None,
    justify: str = "right",
    decimal: str = ".",
    leading_space: bool | None = True,
    quoting: int | None = None,
    fallback_formatter: Callable | None = None,
) -> list[str]:
    """
    Render the elements of an array as a list of display strings.

    The dtype of ``values`` decides which formatter class does the work;
    every other argument is handed to that class.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Data to format.
    formatter : callable or None
        Passed through to the formatter class.
    float_format : str, callable or None
        When None, the ``display.float_format`` option is used.
    na_rep : str, default "NaN"
        Passed through to the formatter class.
    digits : int, optional
        When None, the ``display.precision`` option is used.
    space : str or int, optional
        When None, 12 is used.
    justify : str, default "right"
        Passed through to the formatter class.
    decimal : str, default "."
        Passed through to the formatter class.
    leading_space : bool, optional, default True
        Whether the array should be formatted with a leading space.
        When an array as a column of a Series or DataFrame, we do want
        the leading space to pad between columns.

        When formatting an Index subclass
        (e.g. IntervalIndex._get_values_for_csv), we don't want the
        leading space since it should be left-aligned.
    quoting : int, optional
        Passed through to the formatter class.
    fallback_formatter : callable, optional
        Passed through to the formatter class.

    Returns
    -------
    List[str]
    """
    dtype = values.dtype

    # Order matters: tz-aware datetimes carry an ExtensionDtype, so the
    # datetime checks have to come before the generic extension check.
    fmt_klass: type[_GenericArrayFormatter]
    if lib.is_np_dtype(dtype, "M"):
        values = cast(DatetimeArray, values)
        fmt_klass = _Datetime64Formatter
    elif isinstance(dtype, DatetimeTZDtype):
        values = cast(DatetimeArray, values)
        fmt_klass = _Datetime64TZFormatter
    elif lib.is_np_dtype(dtype, "m"):
        values = cast(TimedeltaArray, values)
        fmt_klass = _Timedelta64Formatter
    elif isinstance(dtype, ExtensionDtype):
        fmt_klass = _ExtensionArrayFormatter
    elif lib.is_np_dtype(dtype, "fc"):
        fmt_klass = FloatArrayFormatter
    elif lib.is_np_dtype(dtype, "iu"):
        fmt_klass = _IntArrayFormatter
    else:
        fmt_klass = _GenericArrayFormatter

    # Fill in the defaults that depend on display options.
    resolved_space = 12 if space is None else space
    resolved_float_format = (
        get_option("display.float_format") if float_format is None else float_format
    )
    resolved_digits = get_option("display.precision") if digits is None else digits

    array_formatter = fmt_klass(
        values,
        digits=resolved_digits,
        na_rep=na_rep,
        float_format=resolved_float_format,
        formatter=formatter,
        space=resolved_space,
        justify=justify,
        decimal=decimal,
        leading_space=leading_space,
        quoting=quoting,
        fallback_formatter=fallback_formatter,
    )
    return array_formatter.get_result()
1162
1163
1164class _GenericArrayFormatter:
1165 def __init__(
1166 self,
1167 values: ArrayLike,
1168 digits: int = 7,
1169 formatter: Callable | None = None,
1170 na_rep: str = "NaN",
1171 space: str | int = 12,
1172 float_format: FloatFormatType | None = None,
1173 justify: str = "right",
1174 decimal: str = ".",
1175 quoting: int | None = None,
1176 fixed_width: bool = True,
1177 leading_space: bool | None = True,
1178 fallback_formatter: Callable | None = None,
1179 ) -> None:
1180 self.values = values
1181 self.digits = digits
1182 self.na_rep = na_rep
1183 self.space = space
1184 self.formatter = formatter
1185 self.float_format = float_format
1186 self.justify = justify
1187 self.decimal = decimal
1188 self.quoting = quoting
1189 self.fixed_width = fixed_width
1190 self.leading_space = leading_space
1191 self.fallback_formatter = fallback_formatter
1192
1193 def get_result(self) -> list[str]:
1194 fmt_values = self._format_strings()
1195 return _make_fixed_width(fmt_values, self.justify)
1196
1197 def _format_strings(self) -> list[str]:
1198 if self.float_format is None:
1199 float_format = get_option("display.float_format")
1200 if float_format is None:
1201 precision = get_option("display.precision")
1202 float_format = lambda x: _trim_zeros_single_float(
1203 f"{x: .{precision:d}f}"
1204 )
1205 else:
1206 float_format = self.float_format
1207
1208 if self.formatter is not None:
1209 formatter = self.formatter
1210 elif self.fallback_formatter is not None:
1211 formatter = self.fallback_formatter
1212 else:
1213 quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE
1214 formatter = partial(
1215 printing.pprint_thing,
1216 escape_chars=("\t", "\r", "\n"),
1217 quote_strings=quote_strings,
1218 )
1219
1220 def _format(x):
1221 if self.na_rep is not None and is_scalar(x) and isna(x):
1222 if x is None:
1223 return "None"
1224 elif x is NA:
1225 return str(NA)
1226 elif lib.is_float(x) and np.isinf(x):
1227 # TODO(3.0): this will be unreachable when use_inf_as_na
1228 # deprecation is enforced
1229 return str(x)
1230 elif x is NaT or isinstance(x, (np.datetime64, np.timedelta64)):
1231 return "NaT"
1232 return self.na_rep
1233 elif isinstance(x, PandasObject):
1234 return str(x)
1235 elif isinstance(x, StringDtype):
1236 return repr(x)
1237 else:
1238 # object dtype
1239 return str(formatter(x))
1240
1241 vals = self.values
1242 if not isinstance(vals, np.ndarray):
1243 raise TypeError(
1244 "ExtensionArray formatting should use _ExtensionArrayFormatter"
1245 )
1246 inferred = lib.map_infer(vals, is_float)
1247 is_float_type = (
1248 inferred
1249 # vals may have 2 or more dimensions
1250 & np.all(notna(vals), axis=tuple(range(1, len(vals.shape))))
1251 )
1252 leading_space = self.leading_space
1253 if leading_space is None:
1254 leading_space = is_float_type.any()
1255
1256 fmt_values = []
1257 for i, v in enumerate(vals):
1258 if (not is_float_type[i] or self.formatter is not None) and leading_space:
1259 fmt_values.append(f" {_format(v)}")
1260 elif is_float_type[i]:
1261 fmt_values.append(float_format(v))
1262 else:
1263 if leading_space is False:
1264 # False specifically, so that the default is
1265 # to include a space if we get here.
1266 tpl = "{v}"
1267 else:
1268 tpl = " {v}"
1269 fmt_values.append(tpl.format(v=_format(v)))
1270
1271 return fmt_values
1272
1273
class FloatArrayFormatter(_GenericArrayFormatter):
    """
    Formatter for float and complex ndarrays.

    ``format_array`` selects this class for numpy dtypes of kind "f" or "c".
    """

    def __init__(self, *args, **kwargs) -> None:
        """
        Store the options via the base class and normalise ``float_format``.

        All arguments are forwarded to ``_GenericArrayFormatter.__init__``.
        """
        super().__init__(*args, **kwargs)

        # float_format is expected to be a string
        # formatter should be used to pass a function
        if self.float_format is not None and self.formatter is None:
            # GH21625, GH22270
            # an explicit float_format switches off the fixed-width handling
            self.fixed_width = False
            if callable(self.float_format):
                # a callable float_format takes the role of the formatter
                self.formatter = self.float_format
                self.float_format = None

    def _value_formatter(
        self,
        float_format: FloatFormatType | None = None,
        threshold: float | None = None,
    ) -> Callable:
        """
        Returns a function to be applied on each value to format it

        Parameters
        ----------
        float_format : optional
            Called as ``float_format(value=v)`` for non-missing values.
            Falls back to ``self.float_format`` when None; if that is also
            falsy, ``str`` is used.
        threshold : float, optional
            When given, non-missing values whose absolute value does not
            exceed it are formatted as ``0.0``.

        Returns
        -------
        callable
            Maps one value to its string form; missing values map to
            ``self.na_rep``.
        """
        # the float_format parameter supersedes self.float_format
        if float_format is None:
            float_format = self.float_format

        # we are going to compose different functions, to first convert to
        # a string, then replace the decimal symbol, and finally chop according
        # to the threshold

        # when there is no float_format, we use str instead of '%g'
        # because str(0.0) = '0.0' while '%g' % 0.0 = '0'
        if float_format:

            def base_formatter(v):
                assert float_format is not None  # for mypy
                # error: "str" not callable
                # error: Unexpected keyword argument "value" for "__call__" of
                # "EngFormatter"
                return (
                    float_format(value=v)  # type: ignore[operator,call-arg]
                    if notna(v)
                    else self.na_rep
                )

        else:

            def base_formatter(v):
                return str(v) if notna(v) else self.na_rep

        if self.decimal != ".":

            def decimal_formatter(v):
                # only the first "." is swapped for the custom decimal symbol
                return base_formatter(v).replace(".", self.decimal, 1)

        else:
            decimal_formatter = base_formatter

        if threshold is None:
            return decimal_formatter

        def formatter(value):
            if notna(value):
                if abs(value) > threshold:
                    return decimal_formatter(value)
                else:
                    # at or below the threshold: display as zero
                    return decimal_formatter(0.0)
            else:
                return self.na_rep

        return formatter

    def get_result_as_array(self) -> np.ndarray:
        """
        Returns the float values converted into strings using
        the parameters given at initialisation, as a numpy array
        """

        def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str):
            # apply ``formatter`` to every non-missing element and use
            # ``na_rep`` for the missing ones; the input shape is kept
            mask = isna(values)
            formatted = np.array(
                [
                    formatter(val) if not m else na_rep
                    for val, m in zip(values.ravel(), mask.ravel())
                ]
            ).reshape(values.shape)
            return formatted

        def format_complex_with_na_rep(
            values: ArrayLike, formatter: Callable, na_rep: str
        ):
            # like format_with_na_rep, but the real and imaginary parts are
            # checked for missingness independently
            real_values = np.real(values).ravel()  # type: ignore[arg-type]
            imag_values = np.imag(values).ravel()  # type: ignore[arg-type]
            real_mask, imag_mask = isna(real_values), isna(imag_values)
            formatted_lst = []
            for val, real_val, imag_val, re_isna, im_isna in zip(
                values.ravel(),
                real_values,
                imag_values,
                real_mask,
                imag_mask,
            ):
                if not re_isna and not im_isna:
                    formatted_lst.append(formatter(val))
                elif not re_isna:  # xxx+nanj
                    formatted_lst.append(f"{formatter(real_val)}+{na_rep}j")
                elif not im_isna:  # nan[+/-]xxxj
                    # The imaginary part may either start with a "-" or a space
                    imag_formatted = formatter(imag_val).strip()
                    if imag_formatted.startswith("-"):
                        formatted_lst.append(f"{na_rep}{imag_formatted}j")
                    else:
                        formatted_lst.append(f"{na_rep}+{imag_formatted}j")
                else:  # nan+nanj
                    formatted_lst.append(f"{na_rep}+{na_rep}j")
            return np.array(formatted_lst).reshape(values.shape)

        # a user-supplied formatter bypasses all of the logic below
        if self.formatter is not None:
            return format_with_na_rep(self.values, self.formatter, self.na_rep)

        # the chop threshold only applies to fixed-width output
        if self.fixed_width:
            threshold = get_option("display.chop_threshold")
        else:
            threshold = None

        # if we have a fixed_width, we'll need to try different float_format
        def format_values_with(float_format):
            formatter = self._value_formatter(float_format, threshold)

            # default formatter leaves a space to the left when formatting
            # floats, must be consistent for left-justifying NaNs (GH #25061)
            na_rep = " " + self.na_rep if self.justify == "left" else self.na_rep

            # different formatting strategies for complex and non-complex data
            # need to distinguish complex and float NaNs (GH #53762)
            values = self.values
            is_complex = is_complex_dtype(values)

            # separate the wheat from the chaff
            if is_complex:
                values = format_complex_with_na_rep(values, formatter, na_rep)
            else:
                values = format_with_na_rep(values, formatter, na_rep)

            if self.fixed_width:
                # trim trailing zeros; the result is returned as an
                # object-dtype array
                if is_complex:
                    result = _trim_zeros_complex(values, self.decimal)
                else:
                    result = _trim_zeros_float(values, self.decimal)
                return np.asarray(result, dtype="object")

            return values

        # There is a special default string when we are fixed-width
        # The default is otherwise to use str instead of a formatting string
        float_format: FloatFormatType | None
        if self.float_format is None:
            if self.fixed_width:
                # fixed-point notation with ``self.digits`` decimals; the
                # " " flag reserves a column for the sign
                if self.leading_space is True:
                    fmt_str = "{value: .{digits:d}f}"
                else:
                    fmt_str = "{value:.{digits:d}f}"
                float_format = partial(fmt_str.format, digits=self.digits)
            else:
                # stays None here, so _value_formatter falls back to str
                float_format = self.float_format
        else:
            # remaining float_format is applied with %-interpolation
            float_format = lambda value: self.float_format % value

        formatted_values = format_values_with(float_format)

        if not self.fixed_width:
            return formatted_values

        # we need to convert to engineering format if some values are too small
        # and would appear as 0, or if some values are too big and take too
        # much space

        if len(formatted_values) > 0:
            maxlen = max(len(x) for x in formatted_values)
            too_long = maxlen > self.digits + 6
        else:
            too_long = False

        abs_vals = np.abs(self.values)
        # this is pretty arbitrary for now
        # large values: more that 8 characters including decimal symbol
        # and first digit, hence > 1e6
        has_large_values = (abs_vals > 1e6).any()
        # small values: non-zero, but below 10 ** -digits
        has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()

        if has_small_values or (too_long and has_large_values):
            # second pass: reformat everything in exponent notation
            if self.leading_space is True:
                fmt_str = "{value: .{digits:d}e}"
            else:
                fmt_str = "{value:.{digits:d}e}"
            float_format = partial(fmt_str.format, digits=self.digits)
            formatted_values = format_values_with(float_format)

        return formatted_values

    def _format_strings(self) -> list[str]:
        """Return the result of ``get_result_as_array`` as a list."""
        return list(self.get_result_as_array())
1473
1474
1475class _IntArrayFormatter(_GenericArrayFormatter):
1476 def _format_strings(self) -> list[str]:
1477 if self.leading_space is False:
1478 formatter_str = lambda x: f"{x:d}".format(x=x)
1479 else:
1480 formatter_str = lambda x: f"{x: d}".format(x=x)
1481 formatter = self.formatter or formatter_str
1482 fmt_values = [formatter(x) for x in self.values]
1483 return fmt_values
1484
1485
1486class _Datetime64Formatter(_GenericArrayFormatter):
1487 values: DatetimeArray
1488
1489 def __init__(
1490 self,
1491 values: DatetimeArray,
1492 nat_rep: str = "NaT",
1493 date_format: None = None,
1494 **kwargs,
1495 ) -> None:
1496 super().__init__(values, **kwargs)
1497 self.nat_rep = nat_rep
1498 self.date_format = date_format
1499
1500 def _format_strings(self) -> list[str]:
1501 """we by definition have DO NOT have a TZ"""
1502 values = self.values
1503
1504 if self.formatter is not None:
1505 return [self.formatter(x) for x in values]
1506
1507 fmt_values = values._format_native_types(
1508 na_rep=self.nat_rep, date_format=self.date_format
1509 )
1510 return fmt_values.tolist()
1511
1512
class _ExtensionArrayFormatter(_GenericArrayFormatter):
    """Formatter for ExtensionArrays; delegates to ``format_array``."""

    values: ExtensionArray

    def _format_strings(self) -> list[str]:
        """
        Convert the extension array to an ndarray and format that.

        When no formatter was supplied, the array's own
        ``_formatter(boxed=True)`` is passed along as the fallback formatter.
        """
        ext_values = self.values
        user_formatter = self.formatter

        fallback = None
        if user_formatter is None:
            fallback = ext_values._formatter(boxed=True)

        if isinstance(ext_values, Categorical):
            # Categorical is special for now, so that we can preserve tzinfo
            array = ext_values._internal_get_values()
        else:
            array = np.asarray(ext_values, dtype=object)

        return format_array(
            array,
            user_formatter,
            float_format=self.float_format,
            na_rep=self.na_rep,
            digits=self.digits,
            space=self.space,
            justify=self.justify,
            decimal=self.decimal,
            leading_space=self.leading_space,
            quoting=self.quoting,
            fallback_formatter=fallback,
        )
1544
1545
def format_percentiles(
    percentiles: (np.ndarray | Sequence[float]),
) -> list[str]:
    """
    Outputs rounded and formatted percentiles.

    Parameters
    ----------
    percentiles : list-like, containing floats from interval [0,1]

    Returns
    -------
    formatted : list of strings
        One entry per input value; an empty input gives an empty list.

    Raises
    ------
    ValueError
        If ``percentiles`` is non-empty and is not numeric or has a value
        outside the interval [0,1] (NaN included).

    Notes
    -----
    Rounding precision is chosen so that: (1) if any two elements of
    ``percentiles`` differ, they remain different after rounding
    (2) no entry is *rounded* to 0% or 100%.
    Any non-integer is always rounded to at least 1 decimal place.

    Examples
    --------
    Keeps all entries different after rounding:

    >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
    ['1.999%', '2.001%', '50%', '66.667%', '99.99%']

    No element is rounded to 0% or 100% (unless already equal to it).
    Duplicates are allowed:

    >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
    ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']

    Nothing to format:

    >>> format_percentiles([])
    []
    """
    percentiles = np.asarray(percentiles)

    # The precision computation below indexes the first and last element, so
    # an empty input has to be answered before it is reached.
    if percentiles.size == 0:
        return []

    # It checks for np.nan as well
    if (
        not is_numeric_dtype(percentiles)
        or not np.all(percentiles >= 0)
        or not np.all(percentiles <= 1)
    ):
        raise ValueError("percentiles should all be in the interval [0,1]")

    percentiles = 100 * percentiles
    prec = get_precision(percentiles)
    # rounding first absorbs float noise such as 28.999999999999996
    percentiles_round_type = percentiles.round(prec).astype(int)

    int_idx = np.isclose(percentiles_round_type, percentiles)

    if np.all(int_idx):
        out = percentiles_round_type.astype(str)
        return [i + "%" for i in out]

    # precision for the non-integer entries is taken from the distinct values
    unique_pcts = np.unique(percentiles)
    prec = get_precision(unique_pcts)
    out = np.empty_like(percentiles, dtype=object)
    out[int_idx] = percentiles[int_idx].round().astype(int).astype(str)

    out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
    return [i + "%" for i in out]
1607
1608
1609def get_precision(array: np.ndarray | Sequence[float]) -> int:
1610 to_begin = array[0] if array[0] > 0 else None
1611 to_end = 100 - array[-1] if array[-1] < 100 else None
1612 diff = np.ediff1d(array, to_begin=to_begin, to_end=to_end)
1613 diff = abs(diff)
1614 prec = -np.floor(np.log10(np.min(diff))).astype(int)
1615 prec = max(1, prec)
1616 return prec
1617
1618
1619def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str:
1620 if x is NaT:
1621 return nat_rep
1622
1623 # Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ')
1624 # so it already uses string formatting rather than strftime (faster).
1625 return str(x)
1626
1627
1628def _format_datetime64_dateonly(
1629 x: NaTType | Timestamp,
1630 nat_rep: str = "NaT",
1631 date_format: str | None = None,
1632) -> str:
1633 if isinstance(x, NaTType):
1634 return nat_rep
1635
1636 if date_format:
1637 return x.strftime(date_format)
1638 else:
1639 # Timestamp._date_repr relies on string formatting (faster than strftime)
1640 return x._date_repr
1641
1642
def get_format_datetime64(
    is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None
) -> Callable:
    """
    Return a formatter callable taking a datetime64 as input and providing
    a string as output.

    With ``is_dates_only`` the date-only formatter is used and it honours
    ``date_format``; otherwise the full datetime formatter is used and
    ``date_format`` is ignored.
    """

    def _date_only(x):
        return _format_datetime64_dateonly(x, nat_rep=nat_rep, date_format=date_format)

    def _full(x):
        return _format_datetime64(x, nat_rep=nat_rep)

    return _date_only if is_dates_only else _full
1655
1656
1657class _Datetime64TZFormatter(_Datetime64Formatter):
1658 values: DatetimeArray
1659
1660 def _format_strings(self) -> list[str]:
1661 """we by definition have a TZ"""
1662 ido = self.values._is_dates_only
1663 values = self.values.astype(object)
1664 formatter = self.formatter or get_format_datetime64(
1665 ido, date_format=self.date_format
1666 )
1667 fmt_values = [formatter(x) for x in values]
1668
1669 return fmt_values
1670
1671
1672class _Timedelta64Formatter(_GenericArrayFormatter):
1673 values: TimedeltaArray
1674
1675 def __init__(
1676 self,
1677 values: TimedeltaArray,
1678 nat_rep: str = "NaT",
1679 **kwargs,
1680 ) -> None:
1681 # TODO: nat_rep is never passed, na_rep is.
1682 super().__init__(values, **kwargs)
1683 self.nat_rep = nat_rep
1684
1685 def _format_strings(self) -> list[str]:
1686 formatter = self.formatter or get_format_timedelta64(
1687 self.values, nat_rep=self.nat_rep, box=False
1688 )
1689 return [formatter(x) for x in self.values]
1690
1691
1692def get_format_timedelta64(
1693 values: TimedeltaArray,
1694 nat_rep: str | float = "NaT",
1695 box: bool = False,
1696) -> Callable:
1697 """
1698 Return a formatter function for a range of timedeltas.
1699 These will all have the same format argument
1700
1701 If box, then show the return in quotes
1702 """
1703 even_days = values._is_dates_only
1704
1705 if even_days:
1706 format = None
1707 else:
1708 format = "long"
1709
1710 def _formatter(x):
1711 if x is None or (is_scalar(x) and isna(x)):
1712 return nat_rep
1713
1714 if not isinstance(x, Timedelta):
1715 x = Timedelta(x)
1716
1717 # Timedelta._repr_base uses string formatting (faster than strftime)
1718 result = x._repr_base(format=format)
1719 if box:
1720 result = f"'{result}'"
1721 return result
1722
1723 return _formatter
1724
1725
1726def _make_fixed_width(
1727 strings: list[str],
1728 justify: str = "right",
1729 minimum: int | None = None,
1730 adj: printing._TextAdjustment | None = None,
1731) -> list[str]:
1732 if len(strings) == 0 or justify == "all":
1733 return strings
1734
1735 if adj is None:
1736 adjustment = printing.get_adjustment()
1737 else:
1738 adjustment = adj
1739
1740 max_len = max(adjustment.len(x) for x in strings)
1741
1742 if minimum is not None:
1743 max_len = max(minimum, max_len)
1744
1745 conf_max = get_option("display.max_colwidth")
1746 if conf_max is not None and max_len > conf_max:
1747 max_len = conf_max
1748
1749 def just(x: str) -> str:
1750 if conf_max is not None:
1751 if (conf_max > 3) & (adjustment.len(x) > max_len):
1752 x = x[: max_len - 3] + "..."
1753 return x
1754
1755 strings = [just(x) for x in strings]
1756 result = adjustment.justify(strings, max_len, mode=justify)
1757 return result
1758
1759
1760def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[str]:
1761 """
1762 Separates the real and imaginary parts from the complex number, and
1763 executes the _trim_zeros_float method on each of those.
1764 """
1765 real_part, imag_part = [], []
1766 for x in str_complexes:
1767 # Complex numbers are represented as "(-)xxx(+/-)xxxj"
1768 # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""]
1769 # Therefore, the imaginary part is the 4th and 3rd last elements,
1770 # and the real part is everything before the imaginary part
1771 trimmed = re.split(r"([j+-])", x)
1772 real_part.append("".join(trimmed[:-4]))
1773 imag_part.append("".join(trimmed[-4:-2]))
1774
1775 # We want to align the lengths of the real and imaginary parts of each complex
1776 # number, as well as the lengths the real (resp. complex) parts of all numbers
1777 # in the array
1778 n = len(str_complexes)
1779 padded_parts = _trim_zeros_float(real_part + imag_part, decimal)
1780 if len(padded_parts) == 0:
1781 return []
1782 padded_length = max(len(part) for part in padded_parts) - 1
1783 padded = [
1784 real_pt # real part, possibly NaN
1785 + imag_pt[0] # +/-
1786 + f"{imag_pt[1:]:>{padded_length}}" # complex part (no sign), possibly nan
1787 + "j"
1788 for real_pt, imag_pt in zip(padded_parts[:n], padded_parts[n:])
1789 ]
1790 return padded
1791
1792
1793def _trim_zeros_single_float(str_float: str) -> str:
1794 """
1795 Trims trailing zeros after a decimal point,
1796 leaving just one if necessary.
1797 """
1798 str_float = str_float.rstrip("0")
1799 if str_float.endswith("."):
1800 str_float += "0"
1801
1802 return str_float
1803
1804
1805def _trim_zeros_float(
1806 str_floats: ArrayLike | list[str], decimal: str = "."
1807) -> list[str]:
1808 """
1809 Trims the maximum number of trailing zeros equally from
1810 all numbers containing decimals, leaving just one if
1811 necessary.
1812 """
1813 trimmed = str_floats
1814 number_regex = re.compile(rf"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$")
1815
1816 def is_number_with_decimal(x) -> bool:
1817 return re.match(number_regex, x) is not None
1818
1819 def should_trim(values: ArrayLike | list[str]) -> bool:
1820 """
1821 Determine if an array of strings should be trimmed.
1822
1823 Returns True if all numbers containing decimals (defined by the
1824 above regular expression) within the array end in a zero, otherwise
1825 returns False.
1826 """
1827 numbers = [x for x in values if is_number_with_decimal(x)]
1828 return len(numbers) > 0 and all(x.endswith("0") for x in numbers)
1829
1830 while should_trim(trimmed):
1831 trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed]
1832
1833 # leave one 0 after the decimal points if need be.
1834 result = [
1835 x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x
1836 for x in trimmed
1837 ]
1838 return result
1839
1840
1841def _has_names(index: Index) -> bool:
1842 if isinstance(index, MultiIndex):
1843 return com.any_not_none(*index.names)
1844 else:
1845 return index.name is not None
1846
1847
class EngFormatter:
    """
    Formats float values according to engineering format.

    Based on matplotlib.ticker.EngFormatter

    Parameters
    ----------
    accuracy : int, optional
        Number of digits after the decimal point of the mantissa.  When
        None the mantissa is rendered with the ``g`` format.
    use_eng_prefix : bool, default False
        Append an SI prefix letter (see ``ENG_PREFIXES``) instead of an
        ``E+XX`` / ``E-XX`` exponent.
    """

    # The SI engineering prefixes
    ENG_PREFIXES = {
        -24: "y",
        -21: "z",
        -18: "a",
        -15: "f",
        -12: "p",
        -9: "n",
        -6: "u",
        -3: "m",
        0: "",
        3: "k",
        6: "M",
        9: "G",
        12: "T",
        15: "P",
        18: "E",
        21: "Z",
        24: "Y",
    }

    def __init__(
        self, accuracy: int | None = None, use_eng_prefix: bool = False
    ) -> None:
        self.accuracy = accuracy
        self.use_eng_prefix = use_eng_prefix

    def __call__(self, num: float) -> str:
        """
        Formats a number in engineering notation, appending a letter
        representing the power of 1000 of the original number. Some examples:
        >>> format_eng = EngFormatter(accuracy=0, use_eng_prefix=True)
        >>> format_eng(0)
        ' 0'
        >>> format_eng = EngFormatter(accuracy=1, use_eng_prefix=True)
        >>> format_eng(1_000_000)
        ' 1.0M'
        >>> format_eng = EngFormatter(accuracy=2, use_eng_prefix=False)
        >>> format_eng("-1e-6")
        '-1.00E-06'

        Parameters
        ----------
        num : numeric or str
            The value to represent: either a numeric value or a string that
            can be converted to a numeric value (as per the
            ``decimal.Decimal`` constructor).

        Returns
        -------
        str
            Engineering formatted string.  NaN gives "NaN"; infinities give
            "inf" or "-inf".
        """
        dnum = Decimal(str(num))

        if dnum.is_nan():
            return "NaN"

        if dnum.is_infinite():
            # keep the sign: negative infinity must not be shown as "inf"
            return "-inf" if dnum < 0 else "inf"

        sign = 1

        if dnum < 0:  # pragma: no cover
            sign = -1
            dnum = -dnum

        # exponent rounded down to a multiple of three
        if dnum != 0:
            pow10 = Decimal(int(math.floor(dnum.log10() / 3) * 3))
        else:
            pow10 = Decimal(0)

        # clamp to the range for which a prefix is defined
        pow10 = pow10.min(max(self.ENG_PREFIXES))
        pow10 = pow10.max(min(self.ENG_PREFIXES))
        int_pow10 = int(pow10)

        if self.use_eng_prefix:
            prefix = self.ENG_PREFIXES[int_pow10]
        elif int_pow10 < 0:
            prefix = f"E-{-int_pow10:02d}"
        else:
            prefix = f"E+{int_pow10:02d}"

        mant = sign * dnum / (10**pow10)

        if self.accuracy is None:  # pragma: no cover
            format_str = "{mant: g}{prefix}"
        else:
            format_str = f"{{mant: .{self.accuracy:d}f}}{{prefix}}"

        return format_str.format(mant=mant, prefix=prefix)
1942
1943
def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None:
    """
    Format float representation in DataFrame with SI notation.

    Installs an ``EngFormatter`` as the ``display.float_format`` option.

    Parameters
    ----------
    accuracy : int, default 3
        Number of decimal digits after the floating point.
    use_eng_prefix : bool, default False
        Whether to represent a value with SI prefixes.

    Returns
    -------
    None

    Examples
    --------
    >>> df = pd.DataFrame([1e-9, 1e-3, 1, 1e3, 1e6])
    >>> df
                  0
    0  1.000000e-09
    1  1.000000e-03
    2  1.000000e+00
    3  1.000000e+03
    4  1.000000e+06

    >>> pd.set_eng_float_format(accuracy=1)
    >>> df
             0
    0  1.0E-09
    1  1.0E-03
    2  1.0E+00
    3  1.0E+03
    4  1.0E+06

    >>> pd.set_eng_float_format(use_eng_prefix=True)
    >>> df
            0
    0  1.000n
    1  1.000m
    2  1.000
    3  1.000k
    4  1.000M

    >>> pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
    >>> df
          0
    0  1.0n
    1  1.0m
    2  1.0
    3  1.0k
    4  1.0M

    >>> pd.set_option("display.float_format", None)  # unset option
    """
    eng_formatter = EngFormatter(accuracy, use_eng_prefix)
    set_option("display.float_format", eng_formatter)
2000
2001
def get_level_lengths(
    levels: Any, sentinel: bool | object | str = ""
) -> list[dict[int, int]]:
    """
    For each index in each level the function returns lengths of indexes.

    Parameters
    ----------
    levels : list of lists
        List of values on for level.
    sentinel : string, optional
        Value which states that no new index starts on there.

    Returns
    -------
    Returns list of maps. For each level returns map of indexes (key is index
    in row and value is length of index).

    Notes
    -----
    A sentinel only continues the running span while every level processed
    before also held the sentinel at that position; otherwise a new span
    starts there.
    """
    if len(levels) == 0:
        return []

    # may_continue[i]: all levels handled so far had the sentinel at i
    may_continue = [True] * len(levels[0])

    all_spans = []
    for level in levels:
        start = 0
        spans = {}

        for pos, key in enumerate(level):
            if may_continue[pos] and key == sentinel:
                continue
            may_continue[pos] = False
            # close the span that was open and start a new one here
            spans[start] = pos - start
            start = pos

        # the last span runs to the end of the level
        spans[start] = len(level) - start
        all_spans.append(spans)

    return all_spans
2043
2044
def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None:
    """
    Appends lines to a buffer.

    The lines are joined with a newline and written in one ``write`` call;
    no trailing newline is added.

    Parameters
    ----------
    buf
        The buffer to write to
    lines
        The lines to append.  Elements that are not strings are converted
        with ``str`` first.
    """
    # The original only converted when at least one element already WAS a
    # string, so a list holding no strings reached ``join`` unconverted and
    # raised TypeError.  Converting every element covers all cases.
    buf.write("\n".join(str(x) for x in lines))