1from __future__ import annotations
2
3from abc import (
4 ABC,
5 abstractmethod,
6)
7import sys
8from textwrap import dedent
9from typing import TYPE_CHECKING
10
11from pandas._config import get_option
12
13from pandas.io.formats import format as fmt
14from pandas.io.formats.printing import pprint_thing
15
16if TYPE_CHECKING:
17 from collections.abc import (
18 Iterable,
19 Iterator,
20 Mapping,
21 Sequence,
22 )
23
24 from pandas._typing import (
25 Dtype,
26 WriteBuffer,
27 )
28
29 from pandas import (
30 DataFrame,
31 Index,
32 Series,
33 )
34
35
# Description of the ``max_cols`` parameter. It is passed as the
# "max_cols_sub" entry of ``frame_sub_kwargs`` and fills the ``{max_cols_sub}``
# placeholder of ``INFO_DOCSTRING``; ``series_sub_kwargs`` supplies an empty
# string for the same key.
frame_max_cols_sub = dedent(
    """\
    max_cols : int, optional
        When to switch from the verbose to the truncated output. If the
        DataFrame has more than `max_cols` columns, the truncated output
        is used. By default, the setting in
        ``pandas.options.display.max_info_columns`` is used."""
)
44
45
# Description of the ``show_counts`` parameter. The same text is used as the
# "show_counts_sub" entry of both ``frame_sub_kwargs`` and
# ``series_sub_kwargs``.
show_counts_sub = dedent(
    """\
    show_counts : bool, optional
        Whether to show the non-null counts. By default, this is shown
        only if the DataFrame is smaller than
        ``pandas.options.display.max_info_rows`` and
        ``pandas.options.display.max_info_columns``. A value of True always
        shows the counts, and False never shows the counts."""
)
55
56
# Doctest-style examples for the DataFrame flavour of the info docstring;
# passed as the "examples_sub" entry of ``frame_sub_kwargs``.
# NOTE(review): the column alignment inside the expected outputs is
# significant for doctests - keep the spacing exactly as rendered by pandas.
frame_examples_sub = dedent(
    """\
    >>> int_values = [1, 2, 3, 4, 5]
    >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
    >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
    >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
    ...                   "float_col": float_values})
    >>> df
        int_col text_col  float_col
    0        1    alpha       0.00
    1        2     beta       0.25
    2        3    gamma       0.50
    3        4    delta       0.75
    4        5  epsilon       1.00

    Prints information of all columns:

    >>> df.info(verbose=True)
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 5 entries, 0 to 4
    Data columns (total 3 columns):
     #   Column     Non-Null Count  Dtype
    ---  ------     --------------  -----
     0   int_col    5 non-null      int64
     1   text_col   5 non-null      object
     2   float_col  5 non-null      float64
    dtypes: float64(1), int64(1), object(1)
    memory usage: 248.0+ bytes

    Prints a summary of columns count and its dtypes but not per column
    information:

    >>> df.info(verbose=False)
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 5 entries, 0 to 4
    Columns: 3 entries, int_col to float_col
    dtypes: float64(1), int64(1), object(1)
    memory usage: 248.0+ bytes

    Pipe output of DataFrame.info to buffer instead of sys.stdout, get
    buffer content and writes to a text file:

    >>> import io
    >>> buffer = io.StringIO()
    >>> df.info(buf=buffer)
    >>> s = buffer.getvalue()
    >>> with open("df_info.txt", "w",
    ...           encoding="utf-8") as f:  # doctest: +SKIP
    ...     f.write(s)
    260

    The `memory_usage` parameter allows deep introspection mode, specially
    useful for big DataFrames and fine-tune memory optimization:

    >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
    >>> df = pd.DataFrame({
    ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
    ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
    ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
    ... })
    >>> df.info()
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1000000 entries, 0 to 999999
    Data columns (total 3 columns):
     #   Column    Non-Null Count    Dtype
    ---  ------    --------------    -----
     0   column_1  1000000 non-null  object
     1   column_2  1000000 non-null  object
     2   column_3  1000000 non-null  object
    dtypes: object(3)
    memory usage: 22.9+ MB

    >>> df.info(memory_usage='deep')
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1000000 entries, 0 to 999999
    Data columns (total 3 columns):
     #   Column    Non-Null Count    Dtype
    ---  ------    --------------    -----
     0   column_1  1000000 non-null  object
     1   column_2  1000000 non-null  object
     2   column_3  1000000 non-null  object
    dtypes: object(3)
    memory usage: 165.9 MB"""
)
141
142
# "See Also" entries for the DataFrame flavour of the info docstring; passed
# as the "see_also_sub" entry of ``frame_sub_kwargs``.
frame_see_also_sub = dedent(
    """\
    DataFrame.describe: Generate descriptive statistics of DataFrame
        columns.
    DataFrame.memory_usage: Memory usage of DataFrame columns."""
)
149
150
# Substitution values for the DataFrame flavour of ``INFO_DOCSTRING``: every
# key matches one ``{placeholder}`` in that template.
# NOTE(review): the consumer of this mapping is not in this module -
# presumably a docstring-substitution decorator on ``DataFrame.info``.
frame_sub_kwargs = {
    "klass": "DataFrame",
    "type_sub": " and columns",
    "max_cols_sub": frame_max_cols_sub,
    "show_counts_sub": show_counts_sub,
    "examples_sub": frame_examples_sub,
    "see_also_sub": frame_see_also_sub,
    # No "versionadded" note is emitted for the DataFrame variant.
    "version_added_sub": "",
}
160
161
# Doctest-style examples for the Series flavour of the info docstring; passed
# as the "examples_sub" entry of ``series_sub_kwargs``.
# NOTE(review): the column alignment inside the expected outputs is
# significant for doctests - keep the spacing exactly as rendered by pandas.
series_examples_sub = dedent(
    """\
    >>> int_values = [1, 2, 3, 4, 5]
    >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
    >>> s = pd.Series(text_values, index=int_values)
    >>> s.info()
    <class 'pandas.core.series.Series'>
    Index: 5 entries, 1 to 5
    Series name: None
    Non-Null Count  Dtype
    --------------  -----
    5 non-null      object
    dtypes: object(1)
    memory usage: 80.0+ bytes

    Prints a summary excluding information about its values:

    >>> s.info(verbose=False)
    <class 'pandas.core.series.Series'>
    Index: 5 entries, 1 to 5
    dtypes: object(1)
    memory usage: 80.0+ bytes

    Pipe output of Series.info to buffer instead of sys.stdout, get
    buffer content and writes to a text file:

    >>> import io
    >>> buffer = io.StringIO()
    >>> s.info(buf=buffer)
    >>> s = buffer.getvalue()
    >>> with open("df_info.txt", "w",
    ...           encoding="utf-8") as f:  # doctest: +SKIP
    ...     f.write(s)
    260

    The `memory_usage` parameter allows deep introspection mode, specially
    useful for big Series and fine-tune memory optimization:

    >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
    >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
    >>> s.info()
    <class 'pandas.core.series.Series'>
    RangeIndex: 1000000 entries, 0 to 999999
    Series name: None
    Non-Null Count    Dtype
    --------------    -----
    1000000 non-null  object
    dtypes: object(1)
    memory usage: 7.6+ MB

    >>> s.info(memory_usage='deep')
    <class 'pandas.core.series.Series'>
    RangeIndex: 1000000 entries, 0 to 999999
    Series name: None
    Non-Null Count    Dtype
    --------------    -----
    1000000 non-null  object
    dtypes: object(1)
    memory usage: 55.3 MB"""
)
222
223
# "See Also" entries for the Series flavour of the info docstring; passed as
# the "see_also_sub" entry of ``series_sub_kwargs``.
series_see_also_sub = dedent(
    """\
    Series.describe: Generate descriptive statistics of Series.
    Series.memory_usage: Memory usage of Series."""
)
229
230
# Substitution values for the Series flavour of ``INFO_DOCSTRING``: every key
# matches one ``{placeholder}`` in that template.
# NOTE(review): the consumer of this mapping is not in this module -
# presumably a docstring-substitution decorator on ``Series.info``.
series_sub_kwargs = {
    "klass": "Series",
    "type_sub": "",
    # Series.info takes no ``max_cols`` argument, so nothing is documented here.
    "max_cols_sub": "",
    "show_counts_sub": show_counts_sub,
    "examples_sub": series_examples_sub,
    "see_also_sub": series_see_also_sub,
    "version_added_sub": "\n.. versionadded:: 1.4.0\n",
}
240
241
# Shared docstring template for the ``info`` methods. The ``{...}`` fields are
# the keys of ``frame_sub_kwargs`` / ``series_sub_kwargs`` defined above.
# The trailing backslash after ``{version_added_sub}`` is a line continuation
# inside the literal: it joins that line with the blank line that follows.
INFO_DOCSTRING = dedent(
    """
    Print a concise summary of a {klass}.

    This method prints information about a {klass} including
    the index dtype{type_sub}, non-null values and memory usage.
    {version_added_sub}\

    Parameters
    ----------
    verbose : bool, optional
        Whether to print the full summary. By default, the setting in
        ``pandas.options.display.max_info_columns`` is followed.
    buf : writable buffer, defaults to sys.stdout
        Where to send the output. By default, the output is printed to
        sys.stdout. Pass a writable buffer if you need to further process
        the output.
    {max_cols_sub}
    memory_usage : bool, str, optional
        Specifies whether total memory usage of the {klass}
        elements (including the index) should be displayed. By default,
        this follows the ``pandas.options.display.memory_usage`` setting.

        True always show memory usage. False never shows memory usage.
        A value of 'deep' is equivalent to "True with deep introspection".
        Memory usage is shown in human-readable units (base-2
        representation). Without deep introspection a memory estimation is
        made based in column dtype and number of rows assuming values
        consume the same memory amount for corresponding dtypes. With deep
        memory introspection, a real memory usage calculation is performed
        at the cost of computational resources. See the
        :ref:`Frequently Asked Questions <df-memory-usage>` for more
        details.
    {show_counts_sub}

    Returns
    -------
    None
        This method prints a summary of a {klass} and returns None.

    See Also
    --------
    {see_also_sub}

    Examples
    --------
    {examples_sub}
    """
)
291
292
293def _put_str(s: str | Dtype, space: int) -> str:
294 """
295 Make string of specified length, padding to the right if necessary.
296
297 Parameters
298 ----------
299 s : Union[str, Dtype]
300 String to be formatted.
301 space : int
302 Length to force string to be of.
303
304 Returns
305 -------
306 str
307 String coerced to given length.
308
309 Examples
310 --------
311 >>> pd.io.formats.info._put_str("panda", 6)
312 'panda '
313 >>> pd.io.formats.info._put_str("panda", 4)
314 'pand'
315 """
316 return str(s)[:space].ljust(space)
317
318
319def _sizeof_fmt(num: float, size_qualifier: str) -> str:
320 """
321 Return size in human readable format.
322
323 Parameters
324 ----------
325 num : int
326 Size in bytes.
327 size_qualifier : str
328 Either empty, or '+' (if lower bound).
329
330 Returns
331 -------
332 str
333 Size in human readable format.
334
335 Examples
336 --------
337 >>> _sizeof_fmt(23028, '')
338 '22.5 KB'
339
340 >>> _sizeof_fmt(23028, '+')
341 '22.5+ KB'
342 """
343 for x in ["bytes", "KB", "MB", "GB", "TB"]:
344 if num < 1024.0:
345 return f"{num:3.1f}{size_qualifier} {x}"
346 num /= 1024.0
347 return f"{num:3.1f}{size_qualifier} PB"
348
349
350def _initialize_memory_usage(
351 memory_usage: bool | str | None = None,
352) -> bool | str:
353 """Get memory usage based on inputs and display options."""
354 if memory_usage is None:
355 memory_usage = get_option("display.memory_usage")
356 return memory_usage
357
358
359class _BaseInfo(ABC):
360 """
361 Base class for DataFrameInfo and SeriesInfo.
362
363 Parameters
364 ----------
365 data : DataFrame or Series
366 Either dataframe or series.
367 memory_usage : bool or str, optional
368 If "deep", introspect the data deeply by interrogating object dtypes
369 for system-level memory consumption, and include it in the returned
370 values.
371 """
372
373 data: DataFrame | Series
374 memory_usage: bool | str
375
376 @property
377 @abstractmethod
378 def dtypes(self) -> Iterable[Dtype]:
379 """
380 Dtypes.
381
382 Returns
383 -------
384 dtypes : sequence
385 Dtype of each of the DataFrame's columns (or one series column).
386 """
387
388 @property
389 @abstractmethod
390 def dtype_counts(self) -> Mapping[str, int]:
391 """Mapping dtype - number of counts."""
392
393 @property
394 @abstractmethod
395 def non_null_counts(self) -> Sequence[int]:
396 """Sequence of non-null counts for all columns or column (if series)."""
397
398 @property
399 @abstractmethod
400 def memory_usage_bytes(self) -> int:
401 """
402 Memory usage in bytes.
403
404 Returns
405 -------
406 memory_usage_bytes : int
407 Object's total memory usage in bytes.
408 """
409
410 @property
411 def memory_usage_string(self) -> str:
412 """Memory usage in a form of human readable string."""
413 return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"
414
415 @property
416 def size_qualifier(self) -> str:
417 size_qualifier = ""
418 if self.memory_usage:
419 if self.memory_usage != "deep":
420 # size_qualifier is just a best effort; not guaranteed to catch
421 # all cases (e.g., it misses categorical data even with object
422 # categories)
423 if (
424 "object" in self.dtype_counts
425 or self.data.index._is_memory_usage_qualified()
426 ):
427 size_qualifier = "+"
428 return size_qualifier
429
430 @abstractmethod
431 def render(
432 self,
433 *,
434 buf: WriteBuffer[str] | None,
435 max_cols: int | None,
436 verbose: bool | None,
437 show_counts: bool | None,
438 ) -> None:
439 pass
440
441
class DataFrameInfo(_BaseInfo):
    """
    Info provider backed by a DataFrame.

    Parameters
    ----------
    data : DataFrame
        Frame to be summarised.
    memory_usage : bool or str, optional
        Memory reporting mode; None defers to ``display.memory_usage``.
    """

    def __init__(
        self,
        data: DataFrame,
        memory_usage: bool | str | None = None,
    ) -> None:
        self.data: DataFrame = data
        self.memory_usage = _initialize_memory_usage(memory_usage)

    @property
    def ids(self) -> Index:
        """
        Column names.

        Returns
        -------
        ids : Index
            DataFrame's column names.
        """
        return self.data.columns

    @property
    def col_count(self) -> int:
        """Number of columns to be summarized."""
        return len(self.ids)

    @property
    def dtypes(self) -> Iterable[Dtype]:
        """
        Dtypes.

        Returns
        -------
        dtypes
            Dtype of each of the DataFrame's columns.
        """
        return self.data.dtypes

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Mapping from dtype name to the number of columns having it."""
        return _get_dataframe_dtype_counts(self.data)

    @property
    def non_null_counts(self) -> Sequence[int]:
        """Non-null count of every column, as returned by ``data.count()``."""
        return self.data.count()

    @property
    def memory_usage_bytes(self) -> int:
        """Sum of the per-column memory usage, index included."""
        use_deep = self.memory_usage == "deep"
        per_column = self.data.memory_usage(index=True, deep=use_deep)
        return per_column.sum()

    def render(
        self,
        *,
        buf: WriteBuffer[str] | None,
        max_cols: int | None,
        verbose: bool | None,
        show_counts: bool | None,
    ) -> None:
        """Print the frame summary to ``buf`` via a _DataFrameInfoPrinter."""
        _DataFrameInfoPrinter(
            info=self,
            max_cols=max_cols,
            verbose=verbose,
            show_counts=show_counts,
        ).to_buffer(buf)
513
514
class SeriesInfo(_BaseInfo):
    """
    Info provider backed by a Series.

    Parameters
    ----------
    data : Series
        Series to be summarised.
    memory_usage : bool or str, optional
        Memory reporting mode; None defers to ``display.memory_usage``.
    """

    def __init__(
        self,
        data: Series,
        memory_usage: bool | str | None = None,
    ) -> None:
        self.data: Series = data
        self.memory_usage = _initialize_memory_usage(memory_usage)

    @property
    def dtypes(self) -> Iterable[Dtype]:
        """Single-element list holding the series dtype."""
        return [self.data.dtypes]

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Mapping from dtype name to count, computed on a one-column frame."""
        # Imported locally, as in the original implementation.
        from pandas.core.frame import DataFrame

        as_frame = DataFrame(self.data)
        return _get_dataframe_dtype_counts(as_frame)

    @property
    def non_null_counts(self) -> Sequence[int]:
        """Single-element list holding the series' non-null count."""
        return [self.data.count()]

    @property
    def memory_usage_bytes(self) -> int:
        """Memory usage in bytes.

        Returns
        -------
        memory_usage_bytes : int
            Object's total memory usage in bytes.
        """
        use_deep = self.memory_usage == "deep"
        return self.data.memory_usage(index=True, deep=use_deep)

    def render(
        self,
        *,
        buf: WriteBuffer[str] | None = None,
        max_cols: int | None = None,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        """
        Print the series summary to ``buf`` via a _SeriesInfoPrinter.

        Raises
        ------
        ValueError
            If ``max_cols`` is given; it is only valid for DataFrame.info.
        """
        if max_cols is not None:
            raise ValueError(
                "Argument `max_cols` can only be passed "
                "in DataFrame.info, not Series.info"
            )
        _SeriesInfoPrinter(
            info=self,
            verbose=verbose,
            show_counts=show_counts,
        ).to_buffer(buf)
573
574
575class _InfoPrinterAbstract:
576 """
577 Class for printing dataframe or series info.
578 """
579
580 def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None:
581 """Save dataframe info into buffer."""
582 table_builder = self._create_table_builder()
583 lines = table_builder.get_lines()
584 if buf is None: # pragma: no cover
585 buf = sys.stdout
586 fmt.buffer_put_lines(buf, lines)
587
588 @abstractmethod
589 def _create_table_builder(self) -> _TableBuilderAbstract:
590 """Create instance of table builder."""
591
592
class _DataFrameInfoPrinter(_InfoPrinterAbstract):
    """
    Class for printing dataframe info.

    Parameters
    ----------
    info : DataFrameInfo
        Instance of DataFrameInfo.
    max_cols : int, optional
        When to switch from the verbose to the truncated output.
    verbose : bool, optional
        Whether to print the full summary.
    show_counts : bool, optional
        Whether to show the non-null counts.
    """

    def __init__(
        self,
        info: DataFrameInfo,
        max_cols: int | None = None,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        self.info = info
        self.data = info.data
        self.verbose = verbose
        # max_cols must be resolved first: the default for show_counts is
        # derived from exceeds_info_cols, which reads self.max_cols.
        self.max_cols = self._initialize_max_cols(max_cols)
        self.show_counts = self._initialize_show_counts(show_counts)

    @property
    def max_rows(self) -> int:
        """
        Maximum info rows to be displayed.

        Taken from the ``display.max_info_rows`` option. If the option is
        None, ``len(data) + 1`` is used so that the limit is never exceeded.
        """
        # get_option takes only the option name; a second positional argument
        # is not a fallback value, so the fallback is applied explicitly.
        max_rows = get_option("display.max_info_rows")
        if max_rows is None:
            return len(self.data) + 1
        return max_rows

    @property
    def exceeds_info_cols(self) -> bool:
        """Check if number of columns to be summarized exceeds the maximum."""
        return bool(self.col_count > self.max_cols)

    @property
    def exceeds_info_rows(self) -> bool:
        """Check if number of rows to be summarized exceeds the maximum."""
        return bool(len(self.data) > self.max_rows)

    @property
    def col_count(self) -> int:
        """Number of columns to be summarized."""
        return self.info.col_count

    def _initialize_max_cols(self, max_cols: int | None) -> int:
        """
        Resolve the column threshold.

        An explicit ``max_cols`` wins; otherwise ``display.max_info_columns``
        is used, falling back to ``col_count + 1`` if that option is None.
        """
        if max_cols is not None:
            return max_cols
        # See max_rows: the fallback is applied here, not passed to get_option.
        option_value = get_option("display.max_info_columns")
        if option_value is None:
            return self.col_count + 1
        return option_value

    def _initialize_show_counts(self, show_counts: bool | None) -> bool:
        """
        Resolve whether non-null counts are shown.

        When not given explicitly, counts are shown only if neither the
        column limit nor the row limit is exceeded.
        """
        if show_counts is None:
            return bool(not self.exceeds_info_cols and not self.exceeds_info_rows)
        else:
            return show_counts

    def _create_table_builder(self) -> _DataFrameTableBuilder:
        """
        Create instance of table builder based on verbosity and display settings.
        """
        if self.verbose:
            return _DataFrameTableBuilderVerbose(
                info=self.info,
                with_counts=self.show_counts,
            )
        elif self.verbose is False:  # specifically set to False, not necessarily None
            return _DataFrameTableBuilderNonVerbose(info=self.info)
        elif self.exceeds_info_cols:
            # verbose left unset: too many columns switches to the short form
            return _DataFrameTableBuilderNonVerbose(info=self.info)
        else:
            return _DataFrameTableBuilderVerbose(
                info=self.info,
                with_counts=self.show_counts,
            )
671
672
class _SeriesInfoPrinter(_InfoPrinterAbstract):
    """Printer that renders the summary of a Series.

    Parameters
    ----------
    info : SeriesInfo
        Instance of SeriesInfo.
    verbose : bool, optional
        Whether to print the full summary.
    show_counts : bool, optional
        Whether to show the non-null counts.
    """

    def __init__(
        self,
        info: SeriesInfo,
        verbose: bool | None = None,
        show_counts: bool | None = None,
    ) -> None:
        self.info = info
        self.data = info.data
        self.verbose = verbose
        self.show_counts = self._initialize_show_counts(show_counts)

    def _initialize_show_counts(self, show_counts: bool | None) -> bool:
        """Counts are shown unless the caller explicitly says otherwise."""
        return True if show_counts is None else show_counts

    def _create_table_builder(self) -> _SeriesTableBuilder:
        """
        Pick the table builder: non-verbose only when ``verbose`` is set and falsy.
        """
        explicitly_terse = self.verbose is not None and not self.verbose
        if explicitly_terse:
            return _SeriesTableBuilderNonVerbose(info=self.info)
        return _SeriesTableBuilderVerbose(
            info=self.info,
            with_counts=self.show_counts,
        )
714
715
716class _TableBuilderAbstract(ABC):
717 """
718 Abstract builder for info table.
719 """
720
721 _lines: list[str]
722 info: _BaseInfo
723
724 @abstractmethod
725 def get_lines(self) -> list[str]:
726 """Product in a form of list of lines (strings)."""
727
728 @property
729 def data(self) -> DataFrame | Series:
730 return self.info.data
731
732 @property
733 def dtypes(self) -> Iterable[Dtype]:
734 """Dtypes of each of the DataFrame's columns."""
735 return self.info.dtypes
736
737 @property
738 def dtype_counts(self) -> Mapping[str, int]:
739 """Mapping dtype - number of counts."""
740 return self.info.dtype_counts
741
742 @property
743 def display_memory_usage(self) -> bool:
744 """Whether to display memory usage."""
745 return bool(self.info.memory_usage)
746
747 @property
748 def memory_usage_string(self) -> str:
749 """Memory usage string with proper size qualifier."""
750 return self.info.memory_usage_string
751
752 @property
753 def non_null_counts(self) -> Sequence[int]:
754 return self.info.non_null_counts
755
756 def add_object_type_line(self) -> None:
757 """Add line with string representation of dataframe to the table."""
758 self._lines.append(str(type(self.data)))
759
760 def add_index_range_line(self) -> None:
761 """Add line with range of indices to the table."""
762 self._lines.append(self.data.index._summary())
763
764 def add_dtypes_line(self) -> None:
765 """Add summary line with dtypes present in dataframe."""
766 collected_dtypes = [
767 f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items())
768 ]
769 self._lines.append(f"dtypes: {', '.join(collected_dtypes)}")
770
771
class _DataFrameTableBuilder(_TableBuilderAbstract):
    """
    Base builder for the info table of a dataframe.

    Parameters
    ----------
    info : DataFrameInfo.
        Instance of DataFrameInfo.
    """

    def __init__(self, *, info: DataFrameInfo) -> None:
        self.info: DataFrameInfo = info

    @property
    def data(self) -> DataFrame:
        """DataFrame."""
        return self.info.data

    @property
    def ids(self) -> Index:
        """Dataframe columns."""
        return self.info.ids

    @property
    def col_count(self) -> int:
        """Number of dataframe columns to be summarized."""
        return self.info.col_count

    def get_lines(self) -> list[str]:
        """Build the table from scratch and return its lines."""
        self._lines = []
        has_no_columns = self.col_count == 0
        fill = self._fill_empty_info if has_no_columns else self._fill_non_empty_info
        fill()
        return self._lines

    def _fill_empty_info(self) -> None:
        """Add the lines describing a dataframe without columns."""
        self.add_object_type_line()
        self.add_index_range_line()
        class_name = type(self.data).__name__
        self._lines.append(f"Empty {class_name}\n")

    @abstractmethod
    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""

    def add_memory_usage_line(self) -> None:
        """Append the ``memory usage: ...`` line."""
        usage = self.memory_usage_string
        self._lines.append(f"memory usage: {usage}")
821
822
class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder):
    """
    Builder of the short dataframe summary (no per-column rows).
    """

    def add_columns_summary_line(self) -> None:
        """Append the one-line summary of the column index."""
        summary = self.ids._summary(name="Columns")
        self._lines.append(summary)

    def _fill_non_empty_info(self) -> None:
        """Add type, index, column-summary, dtypes and optional memory lines."""
        self.add_object_type_line()
        self.add_index_range_line()
        self.add_columns_summary_line()
        self.add_dtypes_line()
        if not self.display_memory_usage:
            return
        self.add_memory_usage_line()
839
840
class _TableBuilderVerboseMixin(_TableBuilderAbstract):
    """
    Shared machinery for the verbose (tabular) info output.

    Users of the mixin set ``with_counts``, ``strrows`` and
    ``gross_column_widths``. Each row in ``strrows`` is a sequence of cell
    strings, one per header.
    """

    SPACING: str = " " * 2
    strrows: Sequence[Sequence[str]]
    gross_column_widths: Sequence[int]
    with_counts: bool

    @property
    @abstractmethod
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""

    @property
    def header_column_widths(self) -> Sequence[int]:
        """Widths of header columns (only titles)."""
        return list(map(len, self.headers))

    def _get_gross_column_widths(self) -> Sequence[int]:
        """Per column, the larger of the header width and the widest cell."""
        body_widths = self._get_body_column_widths()
        return [
            max(header_width, body_width)
            for header_width, body_width in zip(
                self.header_column_widths, body_widths
            )
        ]

    def _get_body_column_widths(self) -> Sequence[int]:
        """Width of the widest cell in every table column."""
        # Transpose rows into columns before measuring.
        columns: Sequence[Sequence[str]] = list(zip(*self.strrows))
        return [max(map(len, column)) for column in columns]

    def _gen_rows(self) -> Iterator[Sequence[str]]:
        """
        Return the iterator over table rows.

        Each element represents a row comprising a sequence of strings.
        """
        make_rows = (
            self._gen_rows_with_counts
            if self.with_counts
            else self._gen_rows_without_counts
        )
        return make_rows()

    @abstractmethod
    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""

    @abstractmethod
    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""

    def add_header_line(self) -> None:
        """Append the header row, each title padded to its column width."""
        cells = [
            _put_str(title, width)
            for title, width in zip(self.headers, self.gross_column_widths)
        ]
        self._lines.append(self.SPACING.join(cells))

    def add_separator_line(self) -> None:
        """Append the dashed row; dashes span the title, not the full column."""
        cells = [
            _put_str("-" * title_width, width)
            for title_width, width in zip(
                self.header_column_widths, self.gross_column_widths
            )
        ]
        self._lines.append(self.SPACING.join(cells))

    def add_body_lines(self) -> None:
        """Append one padded line per row of ``strrows``."""
        for row in self.strrows:
            cells = [
                _put_str(cell, width)
                for cell, width in zip(row, self.gross_column_widths)
            ]
            self._lines.append(self.SPACING.join(cells))

    def _gen_non_null_counts(self) -> Iterator[str]:
        """Iterator with string representation of non-null counts."""
        yield from (f"{count} non-null" for count in self.non_null_counts)

    def _gen_dtypes(self) -> Iterator[str]:
        """Iterator with string representation of column dtypes."""
        yield from map(pprint_thing, self.dtypes)
932
933
class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin):
    """
    Builder of the full dataframe summary, one table row per column.
    """

    def __init__(
        self,
        *,
        info: DataFrameInfo,
        with_counts: bool,
    ) -> None:
        self.info = info
        self.with_counts = with_counts
        # Rows are materialised first because the widths are measured on them.
        self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
        self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

    @property
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""
        if not self.with_counts:
            return [" # ", "Column", "Dtype"]
        return [" # ", "Column", "Non-Null Count", "Dtype"]

    def _fill_non_empty_info(self) -> None:
        """Add all lines of the verbose table, memory usage last if enabled."""
        self.add_object_type_line()
        self.add_index_range_line()
        self.add_columns_summary_line()
        self.add_header_line()
        self.add_separator_line()
        self.add_body_lines()
        self.add_dtypes_line()
        if not self.display_memory_usage:
            return
        self.add_memory_usage_line()

    def add_columns_summary_line(self) -> None:
        """Append the ``Data columns (total N columns):`` line."""
        total = self.col_count
        self._lines.append(f"Data columns (total {total} columns):")

    def _gen_line_numbers(self) -> Iterator[str]:
        """Iterator with string representation of column numbers."""
        for position in range(len(self.ids)):
            yield f" {position}"

    def _gen_columns(self) -> Iterator[str]:
        """Iterator with string representation of column names."""
        yield from map(pprint_thing, self.ids)

    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Rows of (line number, column name, non-null count, dtype)."""
        numbers = self._gen_line_numbers()
        names = self._gen_columns()
        counts = self._gen_non_null_counts()
        dtypes = self._gen_dtypes()
        yield from zip(numbers, names, counts, dtypes)

    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Rows of (line number, column name, dtype)."""
        numbers = self._gen_line_numbers()
        names = self._gen_columns()
        dtypes = self._gen_dtypes()
        yield from zip(numbers, names, dtypes)
998
999
class _SeriesTableBuilder(_TableBuilderAbstract):
    """
    Base builder for the info table of a series.

    Parameters
    ----------
    info : SeriesInfo.
        Instance of SeriesInfo.
    """

    def __init__(self, *, info: SeriesInfo) -> None:
        self.info: SeriesInfo = info

    @property
    def data(self) -> Series:
        """Series."""
        return self.info.data

    def get_lines(self) -> list[str]:
        """Build the table from scratch and return its lines."""
        self._lines = []
        self._fill_non_empty_info()
        return self._lines

    @abstractmethod
    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""

    def add_memory_usage_line(self) -> None:
        """Append the ``memory usage: ...`` line."""
        usage = self.memory_usage_string
        self._lines.append(f"memory usage: {usage}")
1030
1031
class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder):
    """
    Builder of the short series summary (no counts/dtype table).
    """

    def _fill_non_empty_info(self) -> None:
        """Add type, index, dtypes and optional memory-usage lines."""
        self.add_object_type_line()
        self.add_index_range_line()
        self.add_dtypes_line()
        if not self.display_memory_usage:
            return
        self.add_memory_usage_line()
1044
1045
class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin):
    """
    Series info table builder for verbose output.

    Parameters
    ----------
    info : SeriesInfo
        Instance of SeriesInfo.
    with_counts : bool
        Whether the table has a "Non-Null Count" column next to "Dtype".
    """

    def __init__(
        self,
        *,
        info: SeriesInfo,
        with_counts: bool,
    ) -> None:
        self.info = info
        self.with_counts = with_counts
        # Rows are materialised first because the widths are measured on them.
        self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
        self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty series."""
        self.add_object_type_line()
        self.add_index_range_line()
        self.add_series_name_line()
        self.add_header_line()
        self.add_separator_line()
        self.add_body_lines()
        self.add_dtypes_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    def add_series_name_line(self) -> None:
        """Add line with the name of the series."""
        self._lines.append(f"Series name: {self.data.name}")

    @property
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""
        if self.with_counts:
            return ["Non-Null Count", "Dtype"]
        return ["Dtype"]

    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""
        # Each row must be a sequence of cells. Yielding the bare dtype string
        # made every character a separate cell, so only the first character
        # of the dtype was printed. zip() over a single iterable wraps each
        # dtype in a 1-tuple, i.e. a one-cell row.
        yield from zip(self._gen_dtypes())

    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""
        yield from zip(
            self._gen_non_null_counts(),
            self._gen_dtypes(),
        )
1094
1095
1096def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]:
1097 """
1098 Create mapping between datatypes and their number of occurrences.
1099 """
1100 # groupby dtype.name to collect e.g. Categorical columns
1101 return df.dtypes.value_counts().groupby(lambda x: x.name).sum()