import logging
import os
import shutil

import numpy as np

from sdgym.constants import CATEGORICAL, ORDINAL
from sdgym.synthesizers.base import BaseSynthesizer
from sdgym.synthesizers.utils import Transformer

LOGGER = logging.getLogger(__name__)


def try_mkdirs(dir):
    if not os.path.isdir(dir):
        os.makedirs(dir)


class PrivBNSynthesizer(BaseSynthesizer):
    """Synthesizer that wraps the external PrivBayes binary."""

    def __init__(self, theta=20, max_samples=25000):
        assert os.path.exists("privbayes/privBayes.bin")
        self.theta = theta
        self.max_samples = max_samples

    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        self.data = data.copy()
        self.meta = Transformer.get_metadata(data, categorical_columns, ordinal_columns)

    def sample(self, n):
        # Prepare temporary working directories and copy in the PrivBayes executable.
        try_mkdirs("__privbn_tmp/data")
        try_mkdirs("__privbn_tmp/log")
        try_mkdirs("__privbn_tmp/output")
        shutil.copy("privbayes/privBayes.bin", "__privbn_tmp/privBayes.bin")
import pandas as pd

from sdgym.synthesizers.base import BaseSynthesizer


class IdentitySynthesizer(BaseSynthesizer):
    """Trivial synthesizer.

    Returns the same exact data that is used to fit it.
    """

    def fit(self, train_data, *args):
        self.data = pd.DataFrame(train_data)

    def sample(self, samples):
        return self.data.sample(samples, replace=True).values
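A minimal usage sketch, assuming a small NumPy array as training data; the variable names below are illustrative and not part of SDGym:

import numpy as np

train = np.array([[0.1, 1.0], [0.5, 0.0], [0.9, 1.0]])

synth = IdentitySynthesizer()
synth.fit(train)
fake = synth.sample(5)   # 5 rows drawn (with replacement) from the training data
print(fake.shape)        # (5, 2)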
import os
import shutil
import subprocess

import numpy as np

from sdgym.constants import CATEGORICAL, ORDINAL
from sdgym.synthesizers.base import BaseSynthesizer
from sdgym.synthesizers.utils import Transformer


def try_mkdirs(dir):
    if not os.path.isdir(dir):
        os.makedirs(dir)


class PrivBNSynthesizer(BaseSynthesizer):
    """Synthesizer that wraps the external PrivBayes binary."""

    def __init__(self):
        assert os.path.exists("privbayes/privBayes.bin")

    def fit(self, data, categoricals=tuple(), ordinals=tuple()):
        self.data = data.copy()
        self.meta = Transformer.get_metadata(data, categoricals, ordinals)

    def sample(self, n):
        try_mkdirs("__privbn_tmp/data")
        try_mkdirs("__privbn_tmp/log")
        try_mkdirs("__privbn_tmp/output")
        shutil.copy("privbayes/privBayes.bin", "__privbn_tmp/privBayes.bin")

        d_cols = []
        with open("__privbn_tmp/data/real.domain", "w") as f:
from torch.nn import Conv2d, init

from sdgym.synthesizers.base import BaseSynthesizer

# Tail of the helper that assembles the discriminator, generator and
# classifier layer lists (excerpt):
        layers_C += [Conv2d(layer_dims[-1][0], 1, layer_dims[-1][1], 1, 0)]
        return layers_D, layers_G, layers_C


def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        init.normal_(m.weight.data, 0.0, 0.02)

    elif classname.find('BatchNorm') != -1:
        init.normal_(m.weight.data, 1.0, 0.02)
        init.constant_(m.bias.data, 0)


class TableganSynthesizer(BaseSynthesizer):
    """Synthesizer based on the TableGAN architecture."""

    def __init__(self,
                 random_dim=100,
                 num_channels=64,
                 l2scale=1e-5,
                 batch_size=500,
                 epochs=300):

        self.random_dim = random_dim
        self.num_channels = num_channels
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
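A short sketch of how an initializer like weights_init is typically applied; the toy Sequential network below is purely illustrative and is not the TableGAN generator itself:

from torch import nn

# Hypothetical toy network; in TableGAN the generator and discriminator are
# built from the layer lists assembled above.
net = nn.Sequential(
    nn.ConvTranspose2d(100, 64, 4, 1, 0),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.Conv2d(64, 1, 4, 2, 1),
)
net.apply(weights_init)  # applies the initializer recursively to every submodule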
import numpy as np

from sdgym.constants import CONTINUOUS
from sdgym.synthesizers.base import BaseSynthesizer
from sdgym.synthesizers.utils import Transformer


class UniformSynthesizer(BaseSynthesizer):
    """Synthesizer that samples every column independently and uniformly at random."""

    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        self.dtype = data.dtype
        self.shape = data.shape
        self.meta = Transformer.get_metadata(data, categorical_columns, ordinal_columns)

    def sample(self, samples):
        data = np.random.uniform(0, 1, (samples, self.shape[1]))

        for i, c in enumerate(self.meta):
            if c['type'] == CONTINUOUS:
                # Rescale the unit-interval draw to the column's observed range.
                data[:, i] = data[:, i] * (c['max'] - c['min']) + c['min']
            else:
                # Map the unit-interval draw to a category index in [0, size).
                data[:, i] = (data[:, i] * (1 - 1e-8) * c['size']).astype('int32')

        return data
import json

import numpy as np
from pomegranate import BayesianNetwork, ConditionalProbabilityTable, DiscreteDistribution

from sdgym.synthesizers.base import BaseSynthesizer
from sdgym.synthesizers.utils import DiscretizeTransformer


class CLBNSynthesizer(BaseSynthesizer):
    """Chow-Liu Bayesian network (CLBN) synthesizer."""

    def fit(self, data, categoricals=tuple(), ordinals=tuple()):
        # Discretize continuous columns before learning the network structure.
        self.discretizer = DiscretizeTransformer(n_bins=15)
        self.discretizer.fit(data, categoricals, ordinals)
        discretized_data = self.discretizer.transform(data)
        self.model = BayesianNetwork.from_samples(discretized_data, algorithm='chow-liu')

    def bn_sample(self, num_samples):
        """Sample from the Bayesian network.

        Args:
            num_samples(int): Number of samples to generate.
        """
        nodes_parents = self.model.structure
        processing_order = []
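For reference, pomegranate exposes model.structure as a tuple of per-node parent-index tuples, from which an ancestral sampling order can be derived so that parents are always processed before their children. A minimal illustrative sketch (the structure literal below is made up):

structure = ((), (0,), (0, 1))   # node 0 has no parents; node 2 depends on nodes 0 and 1
order = []
while len(order) < len(structure):
    for node, parents in enumerate(structure):
        if node not in order and all(p in order for p in parents):
            order.append(node)
print(order)   # [0, 1, 2] -- every node appears after its parents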
import torch
from torch.nn.functional import softmax

# Tail of the per-column activation step (excerpt): each block of the raw
# network output is squashed with tanh or softmax according to `output_info`.
            ed = st + item[0]
            data_t.append(torch.tanh(data[:, st:ed]))
            st = ed
        elif item[1] == 'softmax':
            ed = st + item[0]
            data_t.append(softmax(data[:, st:ed], dim=1))
            st = ed
        else:
            assert 0

    return torch.cat(data_t, dim=1)
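As far as this excerpt shows, output_info is a list of (width, activation) pairs describing the blocks of a transformed row; a hypothetical example:

# Illustrative only: a continuous column encoded as one tanh unit plus a
# 10-way softmax mode indicator, followed by a 3-category discrete column.
output_info = [(1, 'tanh'), (10, 'softmax'), (3, 'softmax')]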
class VEEGANSynthesizer(BaseSynthesizer):
    """VEEGANSynthesizer."""

    def __init__(
        self,
        embedding_dim=32,
        gen_dim=(128, 128),
        dis_dim=(128, ),
        rec_dim=(128, 128),
        l2scale=1e-6,
        batch_size=500,
        epochs=300
    ):
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
import torch

# Body of the WGAN-GP gradient-penalty computation (excerpt): the critic is
# evaluated on interpolated samples and the gradient norm is pushed towards 1.
    # interpolates = torch.Variable(interpolates, requires_grad=True, device=device)
    disc_interpolates = netD(interpolates)

    gradients = torch.autograd.grad(
        outputs=disc_interpolates, inputs=interpolates,
        grad_outputs=torch.ones(disc_interpolates.size(), device=device),
        create_graph=True, retain_graph=True, only_inputs=True)[0]

    gradient_penalty = (
        (gradients.view(-1, pac * real_data.size(1)).norm(2, dim=1) - 1) ** 2).mean() * lambda_

    return gradient_penalty
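A hedged sketch of how a gradient-penalty term like this is typically folded into a WGAN-GP critic update; calc_gradient_penalty stands in for the excerpted function, and netD, real, fake and optimizerD are placeholders, not names taken from this module:

import torch

# Hypothetical critic update step (WGAN-GP style); all names are placeholders.
y_real = netD(real)
y_fake = netD(fake)
penalty = calc_gradient_penalty(netD, real, fake)   # assumed signature
loss_d = -(torch.mean(y_real) - torch.mean(y_fake)) + penalty

optimizerD.zero_grad()
loss_d.backward()
optimizerD.step()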
class CTGANSynthesizer(BaseSynthesizer):
    """Conditional GAN (CTGAN) synthesizer for tabular data."""

    def __init__(self,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500,
                 epochs=300):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
import torch
from torch.nn.functional import cross_entropy

# Tail of the TVAE reconstruction loss (excerpt): categorical (softmax) blocks
# use cross-entropy, then the KL divergence of the latent code is added.
            st = ed
        elif item[1] == 'softmax':
            ed = st + item[0]
            loss.append(cross_entropy(
                recon_x[:, st:ed], torch.argmax(x[:, st:ed], dim=-1), reduction='sum'))
            st = ed
        else:
            assert 0

    assert st == recon_x.size()[1]
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return sum(loss) * factor / x.size()[0], KLD / x.size()[0]
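An illustrative sketch of how the two values returned above could be combined during training; loss_function, encoder, decoder, optimizer and the remaining names are placeholders rather than the module's own API:

import torch

# Hypothetical TVAE training step: reconstruction term plus KL divergence.
mu, std, logvar = encoder(real)
eps = torch.randn_like(std)
emb = eps * std + mu                      # reparameterization trick
recon_x, sigmas = decoder(emb)

rec_loss, kld = loss_function(recon_x, real, sigmas, mu, logvar, output_info, factor)
loss = rec_loss + kld

optimizer.zero_grad()
loss.backward()
optimizer.step()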
class TVAESynthesizer(BaseSynthesizer):
    """TVAESynthesizer."""

    def __init__(
        self,
        embedding_dim=128,
        compress_dims=(128, 128),
        decompress_dims=(128, 128),
        l2scale=1e-5,
        batch_size=500,
        epochs=300
    ):
        self.embedding_dim = embedding_dim
        self.compress_dims = compress_dims
        self.decompress_dims = decompress_dims
import torch
from torch.nn.functional import cross_entropy, mse_loss, sigmoid

# Per-column autoencoder reconstruction loss (excerpt): MSE for sigmoid
# (continuous) blocks, cross-entropy for softmax (categorical) blocks.
    for item in output_info:
        if item[1] == 'sigmoid':
            ed = st + item[0]
            loss.append(mse_loss(sigmoid(fake[:, st:ed]), real[:, st:ed], reduction='sum'))
            st = ed
        elif item[1] == 'softmax':
            ed = st + item[0]
            loss.append(cross_entropy(
                fake[:, st:ed], torch.argmax(real[:, st:ed], dim=-1), reduction='sum'))
            st = ed
        else:
            assert 0

    return sum(loss) / fake.size()[0]
class MedganSynthesizer(BaseSynthesizer):
    """Synthesizer based on the MedGAN architecture (autoencoder + GAN)."""

    def __init__(self,
                 embedding_dim=128,
                 random_dim=128,
                 generator_dims=(128, 128),          # 128 -> 128 -> 128
                 discriminator_dims=(256, 128, 1),   # datadim * 2 -> 256 -> 128 -> 1
                 compress_dims=(),                   # datadim -> embedding_dim
                 decompress_dims=(),                 # embedding_dim -> datadim
                 bn_decay=0.99,
                 l2scale=0.001,
                 pretrain_epoch=200,
                 batch_size=1000,
                 epochs=2000):

        self.embedding_dim = embedding_dim