synthesized3.data_interface.data_interfaces package#

class synthesized3.data_interface.data_interfaces.PandasDataInterface#

Bases: DataInterface

Data interface for Pandas DataFrames.

Allows the SDK to work with Pandas DataFrames.

Example

>>> from synthesized3.utils.docs import get_example_pandas_df
>>> # Util method to get an example pandas DataFrame
>>> df = get_example_pandas_df()
>>> data_interface = PandasDataInterface(df)
>>> data_interface.columns
['x_nans', 'age', 'gender', 'income', 'DOB']
__init__(df: DataFrame)#

Verify that the number of columns is allowed under the current license.

property raw_dataframe: DataFrame#
property columns: Sequence[str]#

The column names of the connected dataset.

property num_rows: int#

The total number of rows of the connected dataset.
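The three properties above can be summarized with a minimal, hypothetical stand-in written against a plain pandas DataFrame (this is not the SDK class itself, just a sketch of the surface it exposes): `columns` mirrors the DataFrame's column names, `num_rows` its length, and `raw_dataframe` returns the wrapped object unchanged.

```python
import pandas as pd

class MiniPandasInterface:
    """Illustrative stand-in for the PandasDataInterface property surface."""

    def __init__(self, df: pd.DataFrame):
        self._df = df

    @property
    def raw_dataframe(self) -> pd.DataFrame:
        # The original DataFrame, unchanged.
        return self._df

    @property
    def columns(self):
        # Column names of the connected dataset.
        return list(self._df.columns)

    @property
    def num_rows(self) -> int:
        # Total number of rows of the connected dataset.
        return len(self._df)

df = pd.DataFrame({"age": [25, 32, 41], "income": [30_000, 45_000, 52_000]})
data = MiniPandasInterface(df)
print(data.columns)   # ['age', 'income']
print(data.num_rows)  # 3
```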

class synthesized3.data_interface.data_interfaces.SparkDataInterface#

Bases: DataInterface

Data interface for Spark DataFrames.

Allows the SDK to work with large datasets that do not fit entirely into memory.

Example

>>> from pyspark.sql import SparkSession
>>> import synthesized_datasets
>>> df = synthesized_datasets.REGRESSION.biased_data.load()
>>> spark = SparkSession.builder.master("local[4]").appName("sdk-spark").getOrCreate()
>>> df = spark.createDataFrame(df)
>>> data = SparkDataInterface(df, buffer_size=10_000)
>>> data.columns
['age', 'gender', 'income']
__init__(df: DataFrame, buffer_size: int = 0, quantile_error: float = 0.1)#

Initialize a data interface using a Spark DataFrame.

Parameters:
  • df (pyspark.sql.DataFrame) – The spark dataframe.

  • buffer_size (int) – Maximum number of rows to use as an in-memory buffer. This determines the number of partitions for the Spark DataFrame. A value of zero implies buffer_size = num_rows.

  • quantile_error (float) – The acceptable fractional error in the bounds of the calculated quantiles. This trades algorithm speed against accuracy.
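The relationship between buffer_size and partitioning described above can be sketched in a few lines. The exact partitioning logic is internal to the SDK, so this is an illustrative assumption only: each partition holds at most buffer_size rows, and a buffer_size of zero collapses to a single partition covering all rows.

```python
import math

def num_partitions(num_rows: int, buffer_size: int = 0) -> int:
    """Hypothetical partition count: ceil(num_rows / buffer_size)."""
    # A value of zero implies buffer_size = num_rows (one partition).
    if buffer_size == 0:
        buffer_size = num_rows
    return math.ceil(num_rows / buffer_size)

print(num_partitions(1_000_000, 10_000))  # 100
print(num_partitions(1_000_000, 0))       # 1
```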

property raw_dataframe: DataFrame#
property columns: List[str]#

The column names of the connected dataset.

property num_rows: int#

The total number of rows of the connected dataset.

Submodules#

synthesized3.data_interface.data_interfaces.pandas_data_interface module#

class synthesized3.data_interface.data_interfaces.pandas_data_interface.PandasDataInterface#

Bases: DataInterface

Data interface for Pandas DataFrames.

Allows the SDK to work with Pandas DataFrames.

Example

>>> from synthesized3.utils.docs import get_example_pandas_df
>>> # Util method to get an example pandas DataFrame
>>> df = get_example_pandas_df()
>>> data_interface = PandasDataInterface(df)
>>> data_interface.columns
['x_nans', 'age', 'gender', 'income', 'DOB']
__init__(df: DataFrame)#

Verify that the number of columns is allowed under the current license.

property raw_dataframe: DataFrame#
property columns: Sequence[str]#

The column names of the connected dataset.

property num_rows: int#

The total number of rows of the connected dataset.

synthesized3.data_interface.data_interfaces.spark_data_interface module#

class synthesized3.data_interface.data_interfaces.spark_data_interface.SparkDataInterface#

Bases: DataInterface

Data interface for Spark DataFrames.

Allows the SDK to work with large datasets that do not fit entirely into memory.

Example

>>> from pyspark.sql import SparkSession
>>> import synthesized_datasets
>>> df = synthesized_datasets.REGRESSION.biased_data.load()
>>> spark = SparkSession.builder.master("local[4]").appName("sdk-spark").getOrCreate()
>>> df = spark.createDataFrame(df)
>>> data = SparkDataInterface(df, buffer_size=10_000)
>>> data.columns
['age', 'gender', 'income']
__init__(df: DataFrame, buffer_size: int = 0, quantile_error: float = 0.1)#

Initialize a data interface using a Spark DataFrame.

Parameters:
  • df (pyspark.sql.DataFrame) – The spark dataframe.

  • buffer_size (int) – Maximum number of rows to use as an in-memory buffer. This determines the number of partitions for the Spark DataFrame. A value of zero implies buffer_size = num_rows.

  • quantile_error (float) – The acceptable fractional error in the bounds of the calculated quantiles. This trades algorithm speed against accuracy.
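What "fractional error in the bounds of the calculated quantiles" means can be illustrated without Spark. Assuming semantics similar to approximate-quantile algorithms, an estimate for target quantile q with error e is acceptable when its true rank in the data falls within [q - e, q + e]; the helper below is hypothetical and only demonstrates that check.

```python
import random

def within_error(values, estimate, q, error):
    """Check an estimate's actual rank against the allowed quantile band."""
    # Fraction of values at or below the estimate = its empirical rank.
    rank = sum(v <= estimate for v in values) / len(values)
    return (q - error) <= rank <= (q + error)

random.seed(0)
data = [random.random() for _ in range(10_000)]

# The exact median is trivially within a band of half-width 0.1 around q=0.5.
median_estimate = sorted(data)[len(data) // 2]
print(within_error(data, median_estimate, 0.5, 0.1))  # True

# The minimum has rank ~0, far outside [0.4, 0.6].
print(within_error(data, min(data), 0.5, 0.1))  # False
```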

property raw_dataframe: DataFrame#
property columns: List[str]#

The column names of the connected dataset.

property num_rows: int#

The total number of rows of the connected dataset.

synthesized3.data_interface.data_interfaces.spark_data_interface_test module#