Coverage for /pythoncovmergedfiles/medio/medio/src/airflow/airflow/serialization/serializers/pandas.py: 31%

29 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:35 +0000

1# 

2# Licensed to the Apache Software Foundation (ASF) under one 

3# or more contributor license agreements. See the NOTICE file 

4# distributed with this work for additional information 

5# regarding copyright ownership. The ASF licenses this file 

6# to you under the Apache License, Version 2.0 (the 

7# "License"); you may not use this file except in compliance 

8# with the License. You may obtain a copy of the License at 

9# 

10# http://www.apache.org/licenses/LICENSE-2.0 

11# 

12# Unless required by applicable law or agreed to in writing, 

13# software distributed under the License is distributed on an 

14# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 

15# KIND, either express or implied. See the License for the 

16# specific language governing permissions and limitations 

17# under the License. 

18from __future__ import annotations 

19 

20from typing import TYPE_CHECKING 

21 

22from airflow.utils.module_loading import qualname 

23 

# Supported classes are listed by qualified name (as strings) instead of being
# imported at module level: lazy loading for performance reasons.
serializers = ["pandas.core.frame.DataFrame"]

# The very same list object is reused for deserialization.
deserializers = serializers

if TYPE_CHECKING:
    # Needed for annotations only; never imported at runtime.
    from pandas import DataFrame

    from airflow.serialization.serde import U

# Version of the serialized format; deserialize() rejects data whose recorded
# version is greater than this value.
__version__ = 1

36 

37 

def serialize(o: object) -> tuple[U, str, int, bool]:
    """Serialize a pandas ``DataFrame`` to a hex-encoded parquet payload.

    The frame is converted to a pyarrow table and written as
    snappy-compressed parquet into an in-memory buffer; the buffer contents
    are then hex-encoded and decoded to text.

    :param o: the object to serialize.
    :return: a 4-tuple of the hex payload, the result of ``qualname(o)``,
        the module ``__version__`` and ``True``. When ``o`` is not a
        ``DataFrame`` the tuple ``("", "", 0, False)`` is returned instead.
    """
    import pyarrow as pa
    from pandas import DataFrame
    from pyarrow import parquet as pq

    if isinstance(o, DataFrame):
        # For now the output *always* goes to an in-memory sink, until a
        # generic backend that manages sinks is available.
        arrow_table = pa.Table.from_pandas(o)
        sink = pa.BufferOutputStream()
        pq.write_table(arrow_table, sink, compression="snappy")

        hex_payload = sink.getvalue().hex().decode("utf-8")
        return hex_payload, qualname(o), __version__, True

    # Not a DataFrame: signal that nothing was serialized.
    return "", "", 0, False

54 

55 

def deserialize(classname: str, version: int, data: object) -> DataFrame:
    """Rebuild a pandas ``DataFrame`` from a hex-encoded parquet payload.

    :param classname: name of the serialized class; used only in error
        messages here.
    :param version: version recorded with the serialized data.
    :param data: the payload, expected to be a ``str`` of hexadecimal digits.
    :return: the frame obtained by reading the decoded bytes as a parquet
        table and converting it with ``to_pandas()``.
    :raises TypeError: if ``version`` is greater than the module
        ``__version__``, or if ``data`` is not a ``str``.
    """
    if version > __version__:
        raise TypeError(f"serialized {version} of {classname} > {__version__}")

    # Imported only after the version check has passed.
    from io import BytesIO

    from pyarrow import parquet as pq

    if isinstance(data, str):
        parquet_bytes = bytes.fromhex(data)
        return pq.read_table(BytesIO(parquet_bytes)).to_pandas()

    raise TypeError(f"serialized {classname} has wrong data type {type(data)}")