synthesized3 package#

class synthesized3.PandasDataInterface#

Bases: DataInterface

Data interface for pandas DataFrames.

Allows the SDK to work with Pandas DataFrames.

Example

>>> from synthesized3 import PandasDataInterface
>>> from synthesized3.utils.docs import get_example_pandas_df
>>> # Util method to get an example pandas DataFrame
>>> df = get_example_pandas_df()
>>> data_interface = PandasDataInterface(df)
>>> data_interface.columns
['x_nans', 'age', 'gender', 'income', 'DOB']
__init__(df: DataFrame)#

Verifies that the number of columns is allowed under the current license.

property raw_dataframe: DataFrame#
property columns: Sequence[str]#

The column names of the connected dataset.

property num_rows: int#

The total number of rows of the connected dataset.
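
A brief continuation of the example above, illustrating the remaining properties:

>>> n_rows = data_interface.num_rows        # total number of rows
>>> raw = data_interface.raw_dataframe      # the underlying pandas DataFrame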

class synthesized3.TableSynthesizer#

Bases: object

Synthesizer of tabular data.

__init__(meta_collection: MetaCollection, transformer_collection: TransformerCollection, inverse_transformer_collection: TransformerCollection, model_collection: ModelCollection)#
classmethod from_data_interface(data_interface: DataInterface, meta_overrides: Mapping[str, Type[Meta]] | None = None, model_overrides: Mapping[Tuple[str] | str, Dict[str, Type[Model] | Dict[str, Any]]] | None = None) → TableSynthesizer#

The primary method for creating TableSynthesizer objects.

Parameters:
  • data_interface – DataInterface object.

  • meta_overrides – Mapping of column names to Meta subclasses to override the default Meta for that column. Defaults to None.

  • model_overrides – Mapping of column names (or tuples of column names) to model override specifications. Defaults to None. Each entry should be a dictionary with the following structure:

Model overrides structure example:

{<column name or tuple of column names>:
    {
        'model_type': <model class name>,
        'model_kwargs': <kwargs for the model>
    }
}
Returns:

TableSynthesizer object.

Example:
>>> # Basic usage:
>>> from synthesized3.utils.docs import (
...    get_example_spark_df, get_example_spark_session)
>>> from synthesized3 import SparkDataInterface, TableSynthesizer
>>> # Get an example spark dataframe and spark session using util methods
>>> spark = get_example_spark_session()
>>> df = get_example_spark_df(spark)
>>> data_interface = SparkDataInterface(df)
>>> # Build synthesizer from the data interface
>>> synth = TableSynthesizer.from_data_interface(data_interface)
>>> synth.fit(df=df, epochs=1, steps_per_epoch=1) 
Training epoch ...
>>> synth.sample(10, spark=spark)
DataFrame[x_nans: double, age: double, gender: string, income: double, DOB: string]
>>> # With meta overrides:
>>> from synthesized3.meta.metas import IntegerMeta
>>> # 'age' column is a double but will be treated as integer
>>> meta_overrides = {'age': IntegerMeta}
>>> synth = TableSynthesizer.from_data_interface(
...     data_interface, meta_overrides=meta_overrides
... )
>>> synth.fit(df=df, epochs=1, steps_per_epoch=1) 
Training epoch ...
>>> synth.sample(10, spark=spark)
DataFrame[x_nans: double, age: double, gender: string, income: double, DOB: string]
>>> # With model overrides:
>>> model_overrides = {
...     'gender': {
...         'model_type': 'SamplingModel',
...     }
... }
>>> synth = TableSynthesizer.from_data_interface(
...     data_interface, model_overrides=model_overrides
... )
>>> synth.fit(df=df, epochs=1, steps_per_epoch=1) 
Training epoch ...
>>> synth.sample(10, spark=spark)
DataFrame[x_nans: double, age: double, gender: string, income: double, DOB: string]
classmethod from_meta_collection(meta_collection: MetaCollection, model_overrides: Mapping[Tuple[str] | str, Dict[str, Type[Model] | Dict[str, Any]]] | None = None)#

Method for creating TableSynthesizer objects from a MetaCollection.

Parameters:
  • meta_collection – A MetaCollection object.

  • model_overrides – A dictionary of Model overrides, optional. Defaults to None.

Returns:

TableSynthesizer object.
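
A minimal sketch of the call, assuming a MetaCollection instance is already available (its construction is not shown here):

>>> synth = TableSynthesizer.from_meta_collection(
...     meta_collection,  # an existing MetaCollection instance
...     model_overrides={'gender': {'model_type': 'SamplingModel'}},
... )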

fit(df: pandas.DataFrame | pyspark.sql.DataFrame, batch_size: int = 1024, epochs: int = 400, steps_per_epoch: int = 50, num_workers: int = 1, callbacks: Callback | None = None, verbose: int = 1)#

Train the synthesizer, pulling the training data from the data interface.

Parameters:
  • df – The training dataframe.

  • batch_size – The batch size to use for training. Defaults to 1024.

  • epochs – The maximum number of epochs to run. Defaults to 400.

  • steps_per_epoch – The number of steps to run per epoch. Defaults to 50.

  • num_workers – The number of workers to use for distributed training. Defaults to 1.

  • callbacks – A list of callbacks to use for training. Defaults to None.

  • verbose – The verbosity level to use for training. Defaults to 1.
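
A short additional sketch using only the parameters documented above (values are illustrative, continuing the earlier example):

>>> synth.fit(df=df, batch_size=512, epochs=5, steps_per_epoch=10)
Training epoch ...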

sample(num_rows: int | None = None, seed: int | None = None, **kwargs)#

Synthesize a specified number of rows of data.

Parameters:
  • num_rows – The number of rows to synthesize.

  • seed – The random seed to use for sampling. Defaults to None.

  • **kwargs – Additional keyword arguments, e.g. an active spark session when sampling with the Spark data interface (see the examples above).

Returns:

A dataframe (type defined by the data interface) of the synthesized data.
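
A brief sketch of sampling with a fixed seed, assuming the Spark data interface and an active spark session as in the examples above:

>>> synthetic = synth.sample(num_rows=100, seed=42, spark=spark)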

classmethod from_df(df: pandas.DataFrame | pyspark.sql.DataFrame)#

The primary helper method for creating TableSynthesizer objects; it allows users to pass in a dataframe instead of a DataInterface.
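
A minimal sketch (df may be a pandas or Spark DataFrame):

>>> synth = TableSynthesizer.from_df(df)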

class synthesized3.SparkDataInterface#

Bases: DataInterface

Data interface for spark dataframes.

Allows the SDK to work with large datasets that do not fit entirely into memory.

Example

>>> from pyspark.sql import SparkSession
>>> from synthesized3 import SparkDataInterface
>>> import synthesized_datasets
>>> df = synthesized_datasets.REGRESSION.biased_data.load()
>>> spark = SparkSession.builder.master("local[4]").appName("sdk-spark").getOrCreate()
>>> df = spark.createDataFrame(df)
>>> data = SparkDataInterface(df, buffer_size=10_000)
>>> data.columns
['age', 'gender', 'income']
__init__(df: DataFrame, buffer_size: int = 0, quantile_error: float = 0.1)#

Initialize a data interface using a spark dataframe.

Parameters:
  • df (pyspark.sql.DataFrame) – The spark dataframe.

  • buffer_size (int) – Maximum number of rows to use as an in-memory buffer. This is used to determine the number of partitions for the spark dataframe. A value of zero implies buffer_size = num_rows.

  • quantile_error (float) – The acceptable fractional error in the bounds of the calculated quantiles. This comes with a trade-off between algorithm speed and accuracy.
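
A short sketch showing both tuning parameters together (values are illustrative):

>>> data = SparkDataInterface(df, buffer_size=50_000, quantile_error=0.2)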

property raw_dataframe: DataFrame#
property columns: List[str]#

The column names of the connected dataset.

property num_rows: int#

The total number of rows of the connected dataset.

class synthesized3.ColumnType#

Bases: str, Enum

Class to define the internal dtypes that can be handled.

BOOLEAN = 'bool'#
DATETIME = 'datetime'#
FLOAT = 'float32'#
DOUBLE = 'float64'#
INTEGER = 'int32'#
LONG = 'int64'#
STRING = 'string'#
class synthesized3.Nature#

Bases: Enum

Class to define how a column should be modelled.

CONTINUOUS = 'continuous'#
CATEGORICAL = 'categorical'#
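
A brief sketch showing how these enum members map to their string values (values as listed above):

>>> from synthesized3 import ColumnType, Nature
>>> ColumnType.INTEGER.value
'int32'
>>> Nature.CATEGORICAL.value
'categorical'
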

Subpackages#

Submodules#

synthesized3.api module#

synthesized3.api.train(df: DataFrame, config: SynthesizerConfig) → TableSynthesizer#

Method used to train a TableSynthesizer model.

Parameters:
  • df (ps.DataFrame) – Training data.

  • config (SynthesizerConfig) – Configuration object that can be used to configure and override publicly available functionality of the SDK.

Returns:

Trained TableSynthesizer instance
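
A minimal sketch, assuming a SynthesizerConfig has already been built (for example via synthesized3.schema.read_and_validate_config, documented below):

>>> from synthesized3.api import train
>>> synth = train(df, config)  # df: PySpark DataFrame, config: SynthesizerConfig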

synthesized3.api.generate(synth: TableSynthesizer, config: SynthesizerConfig, spark: SparkSession) → DataFrame#

Method used to generate data from a trained TableSynthesizer instance.

Parameters:
  • synth (TableSynthesizer) – Trained TableSynthesizer instance.

  • config (SynthesizerConfig) – Configuration object that can be used to configure and override publicly available functionality of the SDK.

  • spark (pyspark.sql.SparkSession) – Active Spark session used to return a PySpark DataFrame.

Returns:

Synthesized PySpark DataFrame.
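
Continuing the sketch above for train:

>>> from synthesized3.api import generate
>>> synthetic_df = generate(synth, config, spark)  # returns a PySpark DataFrame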

synthesized3.cmd_line module#

synthesized3.cmd_line.parse_args()#
synthesized3.cmd_line.synthesize()#

Function used to run the SDK through the command line. Creates or gets a Spark session and then runs an end-to-end synthesis, with the training data supplied in CSV format.

synthesized3.schema module#

class synthesized3.schema.TrainConfig#

Bases: BaseModel

Configuration to be used to set up the training job.

Parameters:
  • meta_overrides (dict) – Dictionary where keys are column names and values are Meta types. A Meta type is the name of the Meta class as a string, as referenced by calling utils.get_registry(Meta). If not supplied, the meta is automatically inferred.

  • model_overrides (dict) – Dictionary where keys are column names and values are either a Model type or a nested dictionary where the key is a Model type and the values are kwargs and their desired values. A Model type is the name of the Model class as a string, as referenced by calling utils.get_registry(Model).

Examples:

TrainConfig(
    meta_overrides={
        "col_A": "ConstantMeta",
        "col_B": "IntegerMeta",
    },
    model_overrides={
        "col_B": {"EnumerationModel": {"start": 1, "step": 1}},
        "col_C": "SamplingModel",
    },
)

meta_overrides: Dict[str, str]#
model_overrides: Dict[str, str | Dict[str, Dict[str, Any]]]#
classmethod validate_meta_type_is_valid(value)#
classmethod validate_model_type_is_valid(value)#
class synthesized3.schema.GenerateConfig#

Bases: BaseModel

Configuration to be used during data generation.

Parameters:
  • num_rows (int) – Number of rows to synthesize. If not supplied then the generated data will have the same number of rows as the original.

num_rows: int | None#
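
A minimal construction sketch (num_rows is optional, per the field above):

>>> from synthesized3.schema import GenerateConfig
>>> gen_config = GenerateConfig(num_rows=1000)
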
class synthesized3.schema.HyperparameterConfig#

Bases: BaseModel

Configuration object used when fitting the TableSynthesizer model to training data.

Parameters:
  • batch_size (int) – Batch size used during training.

  • epochs (int) – Number of epochs to train for. Default None. If None then training is managed by callbacks.

batch_size: int#
epochs: int | None#
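
A minimal construction sketch (values are illustrative):

>>> from synthesized3.schema import HyperparameterConfig
>>> hp_config = HyperparameterConfig(batch_size=1024, epochs=10)
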
class synthesized3.schema.SynthesizerConfig#

Bases: BaseModel

Configuration specified for an end-to-end synthesis of data from some original dataset.

Parameters:
  • train (TrainConfig) – configuration for TableSynthesizer set-up

  • generate (GenerateConfig) – configuration for data generation

  • hyperparameters (HyperparameterConfig) – configuration for tunable hyperparameters used in model fitting

train: TrainConfig#
generate: GenerateConfig#
hyperparameters: HyperparameterConfig#
get_train_kwargs() → dict#
get_generate_kwargs() → dict#
get_hyperparameters_kwargs() → dict#
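
A hedged end-to-end construction sketch combining the three sub-configurations (values are illustrative; it is assumed here that empty override dictionaries are accepted by TrainConfig):

>>> from synthesized3.schema import (
...     SynthesizerConfig, TrainConfig, GenerateConfig, HyperparameterConfig)
>>> config = SynthesizerConfig(
...     train=TrainConfig(meta_overrides={}, model_overrides={}),
...     generate=GenerateConfig(num_rows=500),
...     hyperparameters=HyperparameterConfig(batch_size=1024, epochs=10),
... )
>>> fit_kwargs = config.get_hyperparameters_kwargs()
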
synthesized3.schema.read_and_validate_config(config_path)#

synthesized3.schema_test module#