Coverage for /pythoncovmergedfiles/medio/medio/src/airflow/airflow/serialization/serializers/pandas.py: 31%
29 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:35 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:35 +0000
1#
2# Licensed to the Apache Software Foundation (ASF) under one
3# or more contributor license agreements. See the NOTICE file
4# distributed with this work for additional information
5# regarding copyright ownership. The ASF licenses this file
6# to you under the Apache License, Version 2.0 (the
7# "License"); you may not use this file except in compliance
8# with the License. You may obtain a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing,
13# software distributed under the License is distributed on an
14# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15# KIND, either express or implied. See the License for the
16# specific language governing permissions and limitations
17# under the License.
18from __future__ import annotations
20from typing import TYPE_CHECKING
22from airflow.utils.module_loading import qualname
# Lazy loading for performance reasons: the supported class is listed by its
# fully qualified name instead of being imported here, and the functions below
# import pandas/pyarrow only when they are called.
serializers = [
    "pandas.core.frame.DataFrame",
]
# Deserialization supports exactly the same classes. This is the same list
# object as ``serializers``, not a copy.
deserializers = serializers

if TYPE_CHECKING:
    # Needed for annotations only; never imported at runtime.
    from pandas import DataFrame

    from airflow.serialization.serde import U

# Version stamped on every payload written by ``serialize``; ``deserialize``
# rejects payloads whose version is greater than this.
__version__ = 1
def serialize(o: object) -> tuple[U, str, int, bool]:
    """Serialize a pandas ``DataFrame`` into a hex-encoded parquet string.

    :param o: the object to serialize.
    :return: a tuple ``(data, classname, version, is_serialized)``. For a
        ``DataFrame`` this is the hex string of the snappy-compressed parquet
        bytes, ``qualname(o)``, ``__version__`` and ``True``. For any other
        object it is ``("", "", 0, False)``.
    """
    import pyarrow as pa
    from pandas import DataFrame
    from pyarrow import parquet as pq

    if not isinstance(o, DataFrame):
        return "", "", 0, False

    # for now, we *always* serialize into in memory
    # until we have a generic backend that manages
    # sinks
    sink = pa.BufferOutputStream()
    pq.write_table(pa.Table.from_pandas(o), sink, compression="snappy")

    # pyarrow's Buffer.hex() yields bytes, hence the decode to get a plain str.
    payload = sink.getvalue().hex().decode("utf-8")
    return payload, qualname(o), __version__, True
def deserialize(classname: str, version: int, data: object) -> DataFrame:
    """Rebuild a pandas ``DataFrame`` from a hex-encoded parquet string.

    :param classname: name of the serialized class; only used in the error
        messages raised here.
    :param version: format version the payload was written with.
    :param data: the serialized payload; must be a ``str`` of hex digits.
    :return: the ``DataFrame`` read back from the parquet bytes.
    :raises TypeError: if ``version`` is greater than ``__version__`` or
        ``data`` is not a ``str``.
    :raises ValueError: if ``data`` is not valid hexadecimal (raised by
        ``bytes.fromhex``).
    """
    if version > __version__:
        raise TypeError(f"serialized {version} of {classname} > {__version__}")

    # Validate the payload type before importing pyarrow, so bad input fails
    # fast with the intended TypeError and does not pay for the heavy import.
    if not isinstance(data, str):
        raise TypeError(f"serialized {classname} has wrong data type {type(data)}")

    import io

    from pyarrow import parquet as pq

    # The context manager closes the in-memory buffer once the table has been
    # read and converted, instead of leaving it to garbage collection.
    with io.BytesIO(bytes.fromhex(data)) as buf:
        return pq.read_table(buf).to_pandas()